* [PATCH 0/3 v7] MSI-X MMIO support for KVM
@ 2011-01-06 10:19 Sheng Yang
  2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

Changes from v6:
1. Discard PBA support for now; we can still add it later.
2. Fix one memory reference bug.
3. Automatically unregister the MMIO when the device is deassigned.
4. Update according to Avi's comments.
5. Add documentation for the new API.

Note that this patchset depends on two PCI patches:

PCI: MSI: Move MSI-X entry definition to pci_regs.h
PCI: Add mask bit definition for MSI-X table

These two patches are in Jesse's pci-2.6 tree. Do I need to repost them?
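
For reference, the definitions those patches provide (sketched here from the
MSI-X table layout in the PCI spec; see the patches themselves for the
authoritative version):

	#define PCI_MSIX_ENTRY_SIZE		16
	#define PCI_MSIX_ENTRY_LOWER_ADDR	0
	#define PCI_MSIX_ENTRY_UPPER_ADDR	4
	#define PCI_MSIX_ENTRY_DATA		8
	#define PCI_MSIX_ENTRY_VECTOR_CTRL	12
	#define PCI_MSIX_ENTRY_CTRL_MASKBIT	1	/* bit 0 of vector control */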

Sheng Yang (3):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Emulate MSI-X table in kernel
  KVM: Add documents for MSI-X MMIO API

 Documentation/kvm/api.txt |   41 +++++++
 arch/x86/kvm/Makefile     |    2 +-
 arch/x86/kvm/x86.c        |    8 +-
 include/linux/kvm.h       |   21 ++++
 include/linux/kvm_host.h  |   48 ++++++++
 virt/kvm/assigned-dev.c   |   44 +++++++
 virt/kvm/iodev.h          |   25 +----
 virt/kvm/kvm_main.c       |   38 ++++++-
 virt/kvm/msix_mmio.c      |  284 +++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h      |   25 ++++
 10 files changed, 505 insertions(+), 31 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h



* [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
@ 2011-01-06 10:19 ` Sheng Yang
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

Then it can be used by other structs in kvm_host.h.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1
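
To illustrate what the move enables, a minimal sketch of the intended usage
pattern (patch 2 uses exactly this shape for its MSI-X MMIO device; the
names below are illustrative only):

	/* A struct declared in kvm_host.h can now embed the device... */
	struct example_mmio_dev {
		struct kvm_io_device dev;
		/* ...plus whatever per-device state is needed */
	};

	/* ...and a handler recovers its container from the callback arg. */
	static int example_mmio_read(struct kvm_io_device *this, gpa_t addr,
				     int len, void *val)
	{
		struct example_mmio_dev *d =
			container_of(this, struct example_mmio_dev, dev);

		(void)d;	/* use d->... to service the access */
		/* return 0 if handled, non-zero to try the next device */
		return -EOPNOTSUPP;
	}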



* [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
@ 2011-01-06 10:19 ` Sheng Yang
  2011-01-17 11:54   ` Marcelo Tosatti
  2011-01-17 12:29   ` Avi Kivity
  2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
  2011-01-12  2:23 ` [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  3 siblings, 2 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

With this we can now support mask bit operations for assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/kvm/Makefile    |    2 +-
 arch/x86/kvm/x86.c       |    8 +-
 include/linux/kvm.h      |   21 ++++
 include/linux/kvm_host.h |   25 ++++
 virt/kvm/assigned-dev.c  |   44 +++++++
 virt/kvm/kvm_main.c      |   38 ++++++-
 virt/kvm/msix_mmio.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h     |   25 ++++
 8 files changed, 440 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
+		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
 	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
 	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..c10670c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -233,6 +233,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -281,6 +302,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..7444dcd 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	mutex_lock(&kvm->lock);
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1b6cbb..b7807c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..dc6e8ca
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,284 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;
+}
+
+/* Caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;
+		if (addr >= start && addr + len <= end) {
+			r = i;
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset, r;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr & 0xf;
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+	if (r)
+		goto out;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	gpa_t entry_base;
+	u32 old_ctrl, new_ctrl;
+	u32 *ctrl_pos;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr & 0xF;
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* Don't allow writing to other fields while the entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
+		ret = -ENOTSYNC;
+	if (old_ctrl == new_ctrl)
+		goto out;
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
+	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
+	if (r || ret)
+		ret = -ENOTSYNC;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All reserved currently */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+	}
+
+	mmio_dev->mmio_nr++;
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = 0, i, j;
+	bool found = 0;
+
+	if (!mmio)
+		return 0;
+
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			found = true;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			mmio_dev->mmio_nr--;
+			break;
+		}
+	}
+	if (!found)
+		r = -EINVAL;
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,25 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
+
+#endif
-- 
1.7.0.1
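
For context, a rough sketch of how a userspace VMM might consume the new exit
reason in its vcpu run loop. Only KVM_EXIT_MSIX_ROUTING_UPDATE and the
run->mmio fields come from this patch; vmm_refresh_msix_route() is a
hypothetical stand-in for the VMM's own routing-update code:

	switch (run->exit_reason) {
	case KVM_EXIT_MSIX_ROUTING_UPDATE:
		/*
		 * The kernel has already applied the guest's write to the
		 * MSI-X table; userspace re-reads the touched entry and
		 * rebuilds its IRQ routing (e.g. via KVM_SET_GSI_ROUTING).
		 */
		vmm_refresh_msix_route(run->mmio.phys_addr, run->mmio.len);
		break;
	case KVM_EXIT_MMIO:
		/* ordinary userspace MMIO emulation path */
		break;
	}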



* [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-01-06 10:19 ` Sheng Yang
  2011-01-17 12:21   ` Avi Kivity
  2011-01-12  2:23 ` [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  3 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang


Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 Documentation/kvm/api.txt |   41 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..4978b94 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API registers the MSI-X MMIO region of a guest device. All MMIO accesses
+to the region are then handled by the kernel. When necessary (e.g. when the
+MSI data/address changed), KVM exits to userspace with the
+KVM_EXIT_MSIX_ROUTING_UPDATE exit reason to indicate the MMIO modification
+and require userspace to update its IRQ routing table.
+
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;		/* Device type and MMIO address type */
+	__u16 max_entries_nr;	/* Maximum entries supported */
+	__u64 base_addr;	/* Guest physical address of MMIO */
+	__u64 base_va;		/* Host virtual address of MMIO mapping */
+	__u64 flags;		/* Reserved for now */
+	__u64 reserved[4];
+};
+
+The device type can currently be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+The MMIO type can currently be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API unregisters the MSI-X MMIO region identified by the dev_id and type
+fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
1.7.0.1
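
To make the API concrete, a hedged usage example for an assigned device
(assigned_dev_id, msix_entries, table_gpa and table_hva are placeholders for
values the VMM already knows; everything else comes from the interface
above):

	struct kvm_msix_mmio_user mmio = {
		.dev_id		= assigned_dev_id,
		.type		= KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
				  KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr	= msix_entries,
		.base_addr	= table_gpa,	/* guest physical address */
		.base_va	= table_hva,	/* host virtual mapping */
		.flags		= 0,		/* reserved, must be zero */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio) < 0)
		perror("KVM_REGISTER_MSIX_MMIO");

	/* Later, dev_id and type alone identify the region: */
	if (ioctl(vm_fd, KVM_UNREGISTER_MSIX_MMIO, &mmio) < 0)
		perror("KVM_UNREGISTER_MSIX_MMIO");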



* Re: [PATCH 0/3 v7] MSI-X MMIO support for KVM
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
                   ` (2 preceding siblings ...)
  2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
@ 2011-01-12  2:23 ` Sheng Yang
  3 siblings, 0 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-12  2:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Thursday 06 January 2011 18:19:42 Sheng Yang wrote:
> Changes from v6:
> 1. Discard PBA support for now; we can still add it later.
> 2. Fix one memory reference bug.
> 3. Automatically unregister the MMIO when the device is deassigned.
> 4. Update according to Avi's comments.
> 5. Add documentation for the new API.

Avi?

--
regards
Yang, Sheng

> 
> Note that this patchset depends on two PCI patches:
> 
> PCI: MSI: Move MSI-X entry definition to pci_regs.h
> PCI: Add mask bit definition for MSI-X table
> 
> These two patches are in Jesse's pci-2.6 tree. Do I need to repost
> them?
> 
> Sheng Yang (3):
>   KVM: Move struct kvm_io_device to kvm_host.h
>   KVM: Emulate MSI-X table in kernel
>   KVM: Add documents for MSI-X MMIO API
> 
>  Documentation/kvm/api.txt |   41 +++++++
>  arch/x86/kvm/Makefile     |    2 +-
>  arch/x86/kvm/x86.c        |    8 +-
>  include/linux/kvm.h       |   21 ++++
>  include/linux/kvm_host.h  |   48 ++++++++
>  virt/kvm/assigned-dev.c   |   44 +++++++
>  virt/kvm/iodev.h          |   25 +----
>  virt/kvm/kvm_main.c       |   38 ++++++-
>  virt/kvm/msix_mmio.c      |  284 +++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h      |   25 ++++
>  10 files changed, 505 insertions(+), 31 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h


* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-01-17 11:54   ` Marcelo Tosatti
  2011-01-17 12:18     ` Sheng Yang
  2011-01-17 12:29   ` Avi Kivity
  1 sibling, 1 reply; 45+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 11:54 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
> With this we can now support mask bit operations for assigned devices.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
>  arch/x86/kvm/Makefile    |    2 +-
>  arch/x86/kvm/x86.c       |    8 +-
>  include/linux/kvm.h      |   21 ++++
>  include/linux/kvm_host.h |   25 ++++
>  virt/kvm/assigned-dev.c  |   44 +++++++
>  virt/kvm/kvm_main.c      |   38 ++++++-
>  virt/kvm/msix_mmio.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h     |   25 ++++
>  8 files changed, 440 insertions(+), 7 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h
> 
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ae72ae6..7444dcd 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -18,6 +18,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
>  #include "irq.h"
> +#include "msix_mmio.h"
>  
>  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
>  						      int assigned_dev_id)
> @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
>  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
>  }
>  
> +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_dev_kernel *adev)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = adev->assigned_dev_id;
> +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +	kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
>  static void kvm_free_assigned_device(struct kvm *kvm,
>  				     struct kvm_assigned_dev_kernel
>  				     *assigned_dev)
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> +
>  	__pci_reset_function(assigned_dev->dev);
>  	pci_restore_state(assigned_dev->dev);
>  
> @@ -785,3 +799,33 @@ out:
>  	return r;
>  }
>  
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	for (i = 0; i < adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}

Should check if the host irq is registered as MSIX type.

> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b1b6cbb..b7807c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -56,6 +56,7 @@
>  
>  #include "coalesced_mmio.h"
>  #include "async_pf.h"
> +#include "msix_mmio.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	struct mm_struct *mm = kvm->mm;
>  
>  	kvm_arch_sync_events(kvm);
> +	kvm_unregister_msix_mmio_dev(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->lock);
>  		break;
>  #endif
> +	case KVM_REGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
> +	case KVM_UNREGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>  		return r;
>  	}
>  #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r < 0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}
> +
>  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>  	if (r < 0)
>  		kvm_put_kvm(kvm);
> @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  		     int len, const void *val)
>  {
> -	int i;
> +	int i, r = -EOPNOTSUPP;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> -	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> +	for (i = 0; i < bus->dev_count; i++) {
> +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> +		if (r == -ENOTSYNC)
> +			break;
> +		else if (!r)
>  			return 0;
> -	return -EOPNOTSUPP;
> +	}
> +	return r;
>  }
>  
>  /* kvm_io_bus_read - called under kvm->slots_lock */
> diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> new file mode 100644
> index 0000000..dc6e8ca
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.c
> @@ -0,0 +1,284 @@
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +
> +#include "msix_mmio.h"
> +#include "iodev.h"
> +
> +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> +				int entry, u32 flag)
> +{
> +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> +				mmio->dev_id, entry, flag);
> +	return -EFAULT;
> +}
> +
> +/* Caller must hold dev->lock */
> +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> +				gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +	int i, r = -EINVAL;
> +
> +	for (i = 0; i < dev->mmio_nr; i++) {
> +		start = dev->mmio[i].table_base_addr;
> +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> +			dev->mmio[i].max_entries_nr;
> +		if (addr >= start && addr + len <= end) {
> +			r = i;
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xf;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	u32 *ctrl_pos;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xF;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* Don't allow writing to other fields while the entry is unmasked */
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;

Instead of copying to/from userspace (which is subject to swapin,
unexpected values), you could include the guest written value in a
kvm_run structure, along with address. Qemu-kvm would use that to
synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
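
Roughly (a hypothetical layout, just to illustrate -- not an existing
kvm_run field):

	/* in the kvm_run union, for KVM_EXIT_MSIX_ROUTING_UPDATE */
	struct {
		__u64 phys_addr;	/* guest physical address written */
		__u8  data[8];		/* value the guest wrote */
		__u32 len;
	} msix_write;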

> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;
> +
> +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> +		ret = -ENOTSYNC;
> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);

Then you rely on the kernel copy of the values to enable/disable irq.

> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> +	mutex_init(&kvm->msix_mmio_dev.lock);
> +	kvm->msix_mmio_dev.kvm = kvm;
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}
> +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	/* All reserved currently */
> +	if (mmio_user->flags) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> +	}
> +
> +	mmio_dev->mmio_nr++;
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	int r = 0, i, j;
> +	bool found = 0;
> +
> +	if (!mmio)
> +		return 0;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);

It's valid for mmio_nr to be equal to KVM_MSIX_MMIO_MAX.

> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> +		    mmio_dev->mmio[i].type == mmio->type) {
> +			found = true;
> +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> +			mmio_dev->mmio_nr--;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		r = -EINVAL;
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}

This is not consistent with registration, where there are no checks
regarding validity of assigned device id. So why is it necessary?

There is a lock ordering problem BTW: 

- read/write handlers: dev->lock -> kvm->lock
- vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock

> +
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = mmio_user->dev_id;
> +	mmio.type = mmio_user->type;
> +
> +	return kvm_free_msix_mmio(kvm, &mmio);
> +}



* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 11:54   ` Marcelo Tosatti
@ 2011-01-17 12:18     ` Sheng Yang
  2011-01-17 12:18       ` Avi Kivity
  2011-01-17 12:39       ` Marcelo Tosatti
  0 siblings, 2 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-17 12:18 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Monday 17 January 2011 19:54:47 Marcelo Tosatti wrote:
> On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
> > With this we can now support mask bit operations for assigned devices.
> > 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > ---
> > 
> >  arch/x86/kvm/Makefile    |    2 +-
> >  arch/x86/kvm/x86.c       |    8 +-
> >  include/linux/kvm.h      |   21 ++++
> >  include/linux/kvm_host.h |   25 ++++
> >  virt/kvm/assigned-dev.c  |   44 +++++++
> >  virt/kvm/kvm_main.c      |   38 ++++++-
> >  virt/kvm/msix_mmio.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++
> >  virt/kvm/msix_mmio.h     |   25 ++++
> >  8 files changed, 440 insertions(+), 7 deletions(-)
> >  create mode 100644 virt/kvm/msix_mmio.c
> >  create mode 100644 virt/kvm/msix_mmio.h
> > 
> > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > index ae72ae6..7444dcd 100644
> > --- a/virt/kvm/assigned-dev.c
> > +++ b/virt/kvm/assigned-dev.c
> > @@ -18,6 +18,7 @@
> > 
> >  #include <linux/interrupt.h>
> >  #include <linux/slab.h>
> >  #include "irq.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
> >  						      int assigned_dev_id)
> > 
> > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > 
> >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> >  
> >  }
> > 
> > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > +				struct kvm_assigned_dev_kernel *adev)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = adev->assigned_dev_id;
> > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +	kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > 
> >  static void kvm_free_assigned_device(struct kvm *kvm,
> >  
> >  				     struct kvm_assigned_dev_kernel
> >  				     *assigned_dev)
> >  
> >  {
> >  
> >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > 
> > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > +
> > 
> >  	__pci_reset_function(assigned_dev->dev);
> >  	pci_restore_state(assigned_dev->dev);
> > 
> > @@ -785,3 +799,33 @@ out:
> >  	return r;
> >  
> >  }
> > 
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, bool mask)
> > +{
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> > +
> > +	mutex_lock(&kvm->lock);
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	for (i = 0; i < adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (mask)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> 
> Should check if the host irq is registered as MSIX type.
> 
> > +out:
> > +	mutex_unlock(&kvm->lock);
> > +	return r;
> > +}
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index b1b6cbb..b7807c8 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> > 
> >  #include "coalesced_mmio.h"
> >  #include "async_pf.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/kvm.h>
> > 
> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > 
> >  	struct mm_struct *mm = kvm->mm;
> >  	
> >  	kvm_arch_sync_events(kvm);
> > 
> > +	kvm_unregister_msix_mmio_dev(kvm);
> > 
> >  	spin_lock(&kvm_lock);
> >  	list_del(&kvm->vm_list);
> >  	spin_unlock(&kvm_lock);
> > 
> > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > 
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  
> >  #endif
> > 
> > +	case KVM_REGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > 
> >  	default:
> >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >  		if (r == -ENOTTY)
> > 
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > 
> >  		return r;
> >  	
> >  	}
> >  
> >  #endif
> > 
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r < 0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> > +
> > 
> >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> >  	if (r < 0)
> >  	
> >  		kvm_put_kvm(kvm);
> > 
> > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
> > 
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  
> >  		     int len, const void *val)
> >  
> >  {
> > 
> > -	int i;
> > +	int i, r = -EOPNOTSUPP;
> > 
> >  	struct kvm_io_bus *bus;
> >  	
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > 
> > -	for (i = 0; i < bus->dev_count; i++)
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > +	for (i = 0; i < bus->dev_count; i++) {
> > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > +		if (r == -ENOTSYNC)
> > +			break;
> > +		else if (!r)
> > 
> >  			return 0;
> > 
> > -	return -EOPNOTSUPP;
> > +	}
> > +	return r;
> > 
> >  }
> >  
> >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > 
> > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > new file mode 100644
> > index 0000000..dc6e8ca
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.c
> > @@ -0,0 +1,284 @@
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <linux/kvm.h>
> > +
> > +#include "msix_mmio.h"
> > +#include "iodev.h"
> > +
> > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> > +				int entry, u32 flag)
> > +{
> > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > +				mmio->dev_id, entry, flag);
> > +	return -EFAULT;
> > +}
> > +
> > +/* Caller must hold dev->lock */
> > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > +				gpa_t addr, int len)
> > +{
> > +	gpa_t start, end;
> > +	int i, r = -EINVAL;
> > +
> > +	for (i = 0; i < dev->mmio_nr; i++) {
> > +		start = dev->mmio[i].table_base_addr;
> > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > +			dev->mmio[i].max_entries_nr;
> > +		if (addr >= start && addr + len <= end) {
> > +			r = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > +				void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> > +
> > +	offset = addr & 0xf;
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	gpa_t entry_base;
> > +	u32 old_ctrl, new_ctrl;
> > +	u32 *ctrl_pos;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> > +
> > +	offset = addr & 0xF;
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > +
> > +	if (get_user(old_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	/* Don't allow writing to other fields while the entry is unmasked */
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > +		goto out;
> 
> Instead of copying to/from userspace (which is subject to swapin,
> unexpected values), you could include the guest written value in a
> kvm_run structure, along with address. Qemu-kvm would use that to
> synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.

We want to accelerate MSI-X mask bit accesses, which won't exit to userspace in
most cases. That's the cost we want to optimize. It's also possible for
userspace to read the correct value of the MMIO (but userspace mostly can't
write to it, in order to avoid synchronization issues).
> 
> > +
> > +	if (get_user(new_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > +		ret = -ENOTSYNC;
> > +	if (old_ctrl == new_ctrl)
> > +		goto out;
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> 
> Then you rely on the kernel copy of the values to enable/disable irq.

Yes, they are guest-owned assigned IRQs. Any potential issue?
> 
> > +	if (r || ret)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > +	kvm->msix_mmio_dev.kvm = kvm;
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	/* All reserved currently */
> > +	if (mmio_user->flags) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > +	}
> > +
> > +	mmio_dev->mmio_nr++;
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	int r = 0, i, j;
> > +	bool found = 0;
> > +
> > +	if (!mmio)
> > +		return 0;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);
> 
> It's valid for mmio_nr to be equal to KVM_MSIX_MMIO_MAX.
> 
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > +		    mmio_dev->mmio[i].type == mmio->type) {
> > +			found = true;
> > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > +			mmio_dev->mmio_nr--;
> > +			break;
> > +		}
> > +	}
> > +	if (!found)
> > +		r = -EINVAL;
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> 
> This is not consistent with registration, where there are no checks
> regarding validity of assigned device id. So why is it necessary?

I don't quite understand. We need to free the mmio anyway; otherwise it would result 
in wrong MMIO interception...

> 
> There is a lock ordering problem BTW:
> 
> - read/write handlers: dev->lock -> kvm->lock
> - vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock

Good catch! I'll fix it (and address the other comments, of course).
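
Spelling out the inversion (a sketch of the two paths, not code from the
series):

	/* A: vcpu thread, MMIO write path */
	mutex_lock(&mmio_dev->lock);
	update_msix_mask_bit()
		-> mutex_lock(&kvm->lock);		/* waits for B */

	/* B: ioctl thread, device deassignment */
	mutex_lock(&kvm->lock);
	kvm_vm_ioctl_deassign_device() -> kvm_free_msix_mmio()
		-> mutex_lock(&mmio_dev->lock);		/* waits for A */

Each side holds the lock the other one needs, so both block forever.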

--
regards
Yang, Sheng

> 
> > +
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = mmio_user->dev_id;
> > +	mmio.type = mmio_user->type;
> > +
> > +	return kvm_free_msix_mmio(kvm, &mmio);
> > +}

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:18     ` Sheng Yang
@ 2011-01-17 12:18       ` Avi Kivity
  2011-01-17 12:48         ` Marcelo Tosatti
  2011-01-17 12:39       ` Marcelo Tosatti
  1 sibling, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-17 12:18 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >  >  +
> >  >  +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >  >  +		goto out;
> >
> >  Instead of copying to/from userspace (which is subject to swapin,
> >  unexpected values), you could include the guest written value in a
> >  kvm_run structure, along with address. Qemu-kvm would use that to
> >  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
>
> We want to accelerate MSI-X mask bit accesses, which in most cases won't exit
> to userspace. That's the cost we want to optimize. It's also possible for
> userspace to read the correct value of the MMIO (but normally userspace can't
> write to it, in order to prevent synchronization issues).

It's also good to have the values in just one place; using userspace 
makes it easy for both the kernel and userspace to see the values (and 
set them after migration, if/when we extend this to virtio).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
@ 2011-01-17 12:21   ` Avi Kivity
  2011-01-17 12:35     ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-17 12:21 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/06/2011 12:19 PM, Sheng Yang wrote:
> Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> ---
>   Documentation/kvm/api.txt |   41 +++++++++++++++++++++++++++++++++++++++++
>   1 files changed, 41 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> index e1a9297..4978b94 100644
> --- a/Documentation/kvm/api.txt
> +++ b/Documentation/kvm/api.txt
> @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
>   	__u16 padding[3];
>   };
>
> +4.54 KVM_REGISTER_MSIX_MMIO
> +
> +Capability: KVM_CAP_MSIX_MMIO
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_msix_mmio_user (in)
> +Returns: 0 on success, -1 on error
> +
> +This API registers the MSI-X MMIO address of a guest device. All MMIO
> +operations on it are then handled by the kernel. When necessary (e.g. when the
> +MSI data/address changed), KVM exits to userspace with
> +KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO modification and require
> +userspace to update the IRQ routing table.
> +
> +struct kvm_msix_mmio_user {
> +	__u32 dev_id;
> +	__u16 type;		/* Device type and MMIO address type */
> +	__u16 max_entries_nr;	/* Maximum entries supported */
> +	__u64 base_addr;	/* Guest physical address of MMIO */
> +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> +	__u64 flags;		/* Reserved for now */
> +	__u64 reserved[4];
> +};
> +
> +Current device type can be:
> +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> +
> +Current MMIO type can be:
> +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> +

How does userspace know which entry of which table changed?  Need a 
field in struct kvm_run for that.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
  2011-01-17 11:54   ` Marcelo Tosatti
@ 2011-01-17 12:29   ` Avi Kivity
  2011-01-17 13:31     ` Michael S. Tsirkin
                       ` (2 more replies)
  1 sibling, 3 replies; 45+ messages in thread
From: Avi Kivity @ 2011-01-17 12:29 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/06/2011 12:19 PM, Sheng Yang wrote:
> Then we can support mask bit operation of assigned devices now.
>
>
>
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	for (i = 0; i<  adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);

Is it okay to call disable_irq_nosync() here?  IIRC we don't check the 
mask bit on irq delivery, so we may forward an interrupt to the guest 
after the mask bit was set.

What does pci say about the mask bit?  when does it take effect?

Another question is whether disable_irq_nosync() actually programs the 
device mask bit, or not.  If it does, then it's slow, and it may be 
better to leave interrupts enabled but have an internal pending bit.  If 
it doesn't program the mask bit, it's fine.

> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}
> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
>
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx<  0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr&  0x3) || (len != 4&&  len != 8))
> +		goto out;
> +
> +	offset = addr&  0xf;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> +		goto out;
> +
> +	mmio =&mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;

and return ret == 0?

> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	u32 *ctrl_pos;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx<  0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr&  0x3) || (len != 4&&  len != 8))
> +		goto out;
> +
> +	offset = addr&  0xF;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> +		goto out;
> +
> +	mmio =&mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* No allow writing to other fields when entry is unmasked */
> +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;
> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;

here, too.

> +
> +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> +		ret = -ENOTSYNC;

goto out?

> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-17 12:21   ` Avi Kivity
@ 2011-01-17 12:35     ` Sheng Yang
  2011-01-17 12:45       ` Avi Kivity
  0 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-01-17 12:35 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Monday 17 January 2011 20:21:45 Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
> > Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> > ---
> > 
> >   Documentation/kvm/api.txt |   41
> >   +++++++++++++++++++++++++++++++++++++++++ 1 files changed, 41
> >   insertions(+), 0 deletions(-)
> > 
> > diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> > index e1a9297..4978b94 100644
> > --- a/Documentation/kvm/api.txt
> > +++ b/Documentation/kvm/api.txt
> > @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
> > 
> >   	__u16 padding[3];
> >   
> >   };
> > 
> > +4.54 KVM_REGISTER_MSIX_MMIO
> > +
> > +Capability: KVM_CAP_MSIX_MMIO
> > +Architectures: x86
> > +Type: vm ioctl
> > +Parameters: struct kvm_msix_mmio_user (in)
> > +Returns: 0 on success, -1 on error
> > +
> > +This API registers the MSI-X MMIO address of a guest device. All
> > +MMIO operations on it are then handled by the kernel. When necessary
> > +(e.g. when the MSI data/address changed), KVM exits to userspace with
> > +KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO modification and
> > +require userspace to update the IRQ routing table.
> > +
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;		/* Device type and MMIO address type */
> > +	__u16 max_entries_nr;	/* Maximum entries supported */
> > +	__u64 base_addr;	/* Guest physical address of MMIO */
> > +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> > +	__u64 flags;		/* Reserved for now */
> > +	__u64 reserved[4];
> > +};
> > +
> > +Current device type can be:
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > +
> > +Current MMIO type can be:
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > +
> 
> How does userspace know which entry of which table changed?  Need a
> field in struct kvm_run for that.

We already get the guest MMIO address for that in the exit information. I've 
created a chain of handlers in qemu to handle it.
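
For reference, registering a table with this API from userspace would look
roughly like this (a sketch based on the ioctl and struct quoted above;
the vm_fd, table mapping and error handling are assumed):

	struct kvm_msix_mmio_user mmio = {
		.dev_id         = dev_id,
		.type           = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
				  KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr = entries_nr,
		.base_addr      = table_gpa,		/* guest physical address */
		.base_va        = (__u64)table_hva,	/* host virtual mapping */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio) < 0)
		perror("KVM_REGISTER_MSIX_MMIO");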

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:18     ` Sheng Yang
  2011-01-17 12:18       ` Avi Kivity
@ 2011-01-17 12:39       ` Marcelo Tosatti
  2011-01-19  8:37         ` Sheng Yang
  1 sibling, 1 reply; 45+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 12:39 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote:
> > > +		goto out;
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > +
> > > +	if (get_user(old_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	/* Don't allow writing to other fields when the entry is unmasked */
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > +		goto out;
> > 
> > Instead of copying to/from userspace (which is subject to swapin,
> > unexpected values), you could include the guest written value in a
> > kvm_run structure, along with address. Qemu-kvm would use that to
> > synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> 
> We want to accelerate MSI-X mask bit accesses, which in most cases won't exit
> to userspace. That's the cost we want to optimize. It's also possible for
> userspace to read the correct value of the MMIO (but normally userspace can't
> write to it, in order to prevent synchronization issues).

Yes, I meant exit to userspace only when necessary, but instead of
copying directly every time, record the value the guest has written in
kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE.

> > > +	if (get_user(new_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > +		ret = -ENOTSYNC;
> > > +	if (old_ctrl == new_ctrl)
> > > +		goto out;
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > 
> > Then you rely on the kernel copy of the values to enable/disable irq.
> 
> Yes, they are guest-owned assigned IRQs. Any potential issue?

Nothing prevents the kernel from enabling or disabling irq multiple
times with this code (what prevents it is a qemu-kvm that behaves as
expected). This is not very good.

Perhaps the guest can only harm itself with that, but i'm not sure.

Also, if an msix table page is swapped out, the guest will hang.

> > > +	return r;
> > > +}
> > 
> > This is not consistent with registration, where there are no checks
> > regarding validity of assigned device id. So why is it necessary?
> 
> I don't quite understand. We need to free the mmio anyway; otherwise it would
> result in wrong MMIO interception...

If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the
read/write paths, there is no need to free anything.

> 
> > 
> > There is a lock ordering problem BTW:
> > 
> > - read/write handlers: dev->lock -> kvm->lock
> > - vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock
> 
> Good catch! I'll fix it (and address the other comments, of course).
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > +
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = mmio_user->dev_id;
> > > +	mmio.type = mmio_user->type;
> > > +
> > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > +}

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-17 12:35     ` Sheng Yang
@ 2011-01-17 12:45       ` Avi Kivity
  2011-01-19  8:21         ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-17 12:45 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/17/2011 02:35 PM, Sheng Yang wrote:
> On Monday 17 January 2011 20:21:45 Avi Kivity wrote:
> >  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >  >  Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> >  >  ---
> >  >
> >  >    Documentation/kvm/api.txt |   41
> >  >    +++++++++++++++++++++++++++++++++++++++++ 1 files changed, 41
> >  >    insertions(+), 0 deletions(-)
> >  >
> >  >  diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> >  >  index e1a9297..4978b94 100644
> >  >  --- a/Documentation/kvm/api.txt
> >  >  +++ b/Documentation/kvm/api.txt
> >  >  @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
> >  >
> >  >    	__u16 padding[3];
> >  >
> >  >    };
> >  >
> >  >  +4.54 KVM_REGISTER_MSIX_MMIO
> >  >  +
> >  >  +Capability: KVM_CAP_MSIX_MMIO
> >  >  +Architectures: x86
> >  >  +Type: vm ioctl
> >  >  +Parameters: struct kvm_msix_mmio_user (in)
> >  >  +Returns: 0 on success, -1 on error
> >  >  +
> >  >  +This API registers the MSI-X MMIO address of a guest device.
> >  >  +All MMIO operations on it are then handled by the kernel. When
> >  >  +necessary (e.g. when the MSI data/address changed), KVM exits to
> >  >  +userspace with KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO
> >  >  +modification and require userspace to update the IRQ routing table.
> >  >  +
> >  >  +struct kvm_msix_mmio_user {
> >  >  +	__u32 dev_id;
> >  >  +	__u16 type;		/* Device type and MMIO address type */
> >  >  +	__u16 max_entries_nr;	/* Maximum entries supported */
> >  >  +	__u64 base_addr;	/* Guest physical address of MMIO */
> >  >  +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> >  >  +	__u64 flags;		/* Reserved for now */
> >  >  +	__u64 reserved[4];
> >  >  +};
> >  >  +
> >  >  +Current device type can be:
> >  >  +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<   0)
> >  >  +
> >  >  +Current MMIO type can be:
> >  >  +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<   8)
> >  >  +
> >
> >  How does userspace know which entry of which table changed?  Need a
> >  field in struct kvm_run for that.
>
> We already get the guest MMIO address for that in the exit information. I've
> created a chain of handlers in qemu to handle it.
>

But we already decoded the table and entry...

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:18       ` Avi Kivity
@ 2011-01-17 12:48         ` Marcelo Tosatti
  2011-01-17 12:51           ` Avi Kivity
  0 siblings, 1 reply; 45+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 12:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
> On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >>  >  +
> >>  >  +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >>  >  +		goto out;
> >>
> >>  Instead of copying to/from userspace (which is subject to swapin,
> >>  unexpected values), you could include the guest written value in a
> >>  kvm_run structure, along with address. Qemu-kvm would use that to
> >>  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> >
> >We want to accelerate MSI-X mask bit accesses, which in most cases won't exit
> >to userspace. That's the cost we want to optimize. It's also possible for
> >userspace to read the correct value of the MMIO (but normally userspace can't
> >write to it, in order to prevent synchronization issues).
> 
> It's also good to have the values in just one place; using userspace
> makes it easy for both the kernel and userspace to see the values
> (and set them after migration, if/when we extend this to virtio).

Right, that's an advantage, but:

- How can userspace ever synchronize with updates by the kernel 
  to the MSI-X entry?
- Reading/writing to the userspace area must be done carefully,
  values must be validated before being used.
- Swapping issue (minor?).



^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:48         ` Marcelo Tosatti
@ 2011-01-17 12:51           ` Avi Kivity
  2011-01-17 15:52             ` Marcelo Tosatti
  0 siblings, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-17 12:51 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On 01/17/2011 02:48 PM, Marcelo Tosatti wrote:
> On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
> >  On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >  >>   >   +
> >  >>   >   +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >  >>   >   +		goto out;
> >  >>
> >  >>   Instead of copying to/from userspace (which is subject to swapin,
> >  >>   unexpected values), you could include the guest written value in a
> >  >>   kvm_run structure, along with address. Qemu-kvm would use that to
> >  >>   synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> >  >
> >  >We want to accelerate MSI-X mask bit accesses, which in most cases won't
> >  >exit to userspace. That's the cost we want to optimize. It's also possible
> >  >for userspace to read the correct value of the MMIO (but normally userspace
> >  >can't write to it, in order to prevent synchronization issues).
> >
> >  It's also good to have the values in just one place; using userspace
> >  makes it easy for both the kernel and userspace to see the values
> >  (and set them after migration, if/when we extend this to virtio).
>
> Right, that's an advantage, but:
>
> - How can userspace ever synchronize with updates by the kernel
>    to the MSI-X entry?

When the guest writes a value which kvm cannot handle itself 
(i.e. a change to anything other than the mask bit), we exit with the 
table and entry ids, so userspace can reread them.
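
A hypothetical shape for that exit payload (illustrative field names only,
not part of the posted patches):

	/* sketch: new member of struct kvm_run's exit union */
	struct {
		__u32 dev_id;	/* which registered table */
		__u16 type;	/* device/MMIO type, as at registration */
		__u16 entry;	/* which MSI-X entry was written */
	} msix_routing;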

> - Reading/writing to the userspace area must be done carefully,
>    values must be validated before being used.

True every time...

> - Swapping issue (minor?).

I don't see the issue... just like any part of qemu that may be swapped 
out, blocking the vcpu thread.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:29   ` Avi Kivity
@ 2011-01-17 13:31     ` Michael S. Tsirkin
  2011-01-17 13:47     ` Jan Kiszka
  2011-01-30  4:38     ` Sheng Yang
  2 siblings, 0 replies; 45+ messages in thread
From: Michael S. Tsirkin @ 2011-01-17 13:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm

On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >Then we can support mask bit operation of assigned devices now.
> >
> >
> >
> >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >+				int assigned_dev_id, int entry, bool mask)
> >+{
> >+	int r = -EFAULT;
> >+	struct kvm_assigned_dev_kernel *adev;
> >+	int i;
> >+
> >+	if (!irqchip_in_kernel(kvm))
> >+		return r;
> >+
> >+	mutex_lock(&kvm->lock);
> >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >+				      assigned_dev_id);
> >+	if (!adev)
> >+		goto out;
> >+
> >+	for (i = 0; i < adev->entries_nr; i++)
> >+		if (adev->host_msix_entries[i].entry == entry) {
> >+			if (mask)
> >+				disable_irq_nosync(
> >+					adev->host_msix_entries[i].vector);
> 
> Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> the mask bit on irq delivery, so we may forward an interrupt to the
> guest after the mask bit was set.
> 
> What does pci say about the mask bit?  when does it take effect?

It is only guaranteed to take effect on the next read.
In practice, it takes effect earlier and guests
rely on this as an optimization.
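
Sketching the guest-side pattern this implies (mask, then read the vector
control dword back so the write is guaranteed to have taken effect):

	ctrl = readl(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
	writel(ctrl | PCI_MSIX_ENTRY_CTRL_MASKBIT,
	       entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
	readl(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);	/* flushing read */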

> Another question is whether disable_irq_nosync() actually programs
> the device mask bit, or not.  If it does, then it's slow, and it may
> be better to leave interrupts enabled but have an internal pending
> bit.  If it doesn't program the mask bit, it's fine.
> 
> >+			else
> >+				enable_irq(adev->host_msix_entries[i].vector);
> >+			r = 0;
> >+			break;
> >+		}
> >+out:
> >+	mutex_unlock(&kvm->lock);
> >+	return r;
> >+}
> >
> >+
> >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >+				void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, ret = 0, entry, offset, r;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx < 0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr & 0x3) || (len != 4 && len != 8))
> >+		goto out;
> >+
> >+	offset = addr & 0xf;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> >+		goto out;
> >+
> >+	mmio = &mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >+	if (r)
> >+		goto out;
> 
> and return ret == 0?
> 
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> >+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> >+				int len, const void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, entry, offset, ret = 0, r = 0;
> >+	gpa_t entry_base;
> >+	u32 old_ctrl, new_ctrl;
> >+	u32 *ctrl_pos;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx < 0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr & 0x3) || (len != 4 && len != 8))
> >+		goto out;
> >+
> >+	offset = addr & 0xF;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> >+		goto out;
> >+
> >+	mmio = &mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> >+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> >+
> >+	if (get_user(old_ctrl, ctrl_pos))
> >+		goto out;
> >+
> >+	/* Don't allow writing to other fields when the entry is unmasked */
> >+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> >+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> >+		goto out;
> >+
> >+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >+		goto out;
> >+
> >+	if (get_user(new_ctrl, ctrl_pos))
> >+		goto out;
> 
> here, too.
> 
> >+
> >+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> >+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> >+		ret = -ENOTSYNC;
> 
> goto out?
> 
> >+	if (old_ctrl == new_ctrl)
> >+		goto out;
> >+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> >+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> >+	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> >+			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> >+	if (r || ret)
> >+		ret = -ENOTSYNC;
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> 
> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:29   ` Avi Kivity
  2011-01-17 13:31     ` Michael S. Tsirkin
@ 2011-01-17 13:47     ` Jan Kiszka
  2011-01-30  4:38     ` Sheng Yang
  2 siblings, 0 replies; 45+ messages in thread
From: Jan Kiszka @ 2011-01-17 13:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, Michael S. Tsirkin, kvm

On 2011-01-17 13:29, Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
>> Then we can support mask bit operation of assigned devices now.
>>
>>
>>
>> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
>> +                int assigned_dev_id, int entry, bool mask)
>> +{
>> +    int r = -EFAULT;
>> +    struct kvm_assigned_dev_kernel *adev;
>> +    int i;
>> +
>> +    if (!irqchip_in_kernel(kvm))
>> +        return r;
>> +
>> +    mutex_lock(&kvm->lock);
>> +    adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
>> +                      assigned_dev_id);
>> +    if (!adev)
>> +        goto out;
>> +
>> +    for (i = 0; i < adev->entries_nr; i++)
>> +        if (adev->host_msix_entries[i].entry == entry) {
>> +            if (mask)
>> +                disable_irq_nosync(
>> +                    adev->host_msix_entries[i].vector);
> 
> Is it okay to call disable_irq_nosync() here?  IIRC we don't check the
> mask bit on irq delivery, so we may forward an interrupt to the guest
> after the mask bit was set.
> 
> What does pci say about the mask bit?  when does it take effect?
> 
> Another question is whether disable_irq_nosync() actually programs the
> device mask bit, or not.  If it does, then it's slow, and it may be
> better to leave interrupts enabled but have an internal pending bit.  If
> it doesn't program the mask bit, it's fine.

disable_irq* works lazily, only disabling the line in software. If an
IRQ actually occurs, it is marked pending and the line is left masked on
handler return (that's one reason why masking via PCI config space is
slower at macro level).
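
A conceptual sketch of the lazy scheme (simplified, not the actual genirq
code):

	void disable_irq_nosync_sketch(struct irq_desc *desc)
	{
		desc->depth++;			/* software state only */
	}

	void flow_handler_sketch(struct irq_desc *desc)
	{
		if (desc->depth) {		/* soft-disabled? */
			desc->status |= IRQ_PENDING;	/* remember it */
			mask_irq(desc);		/* mask the line only now */
			return;
		}
		handle_irq_event(desc);
	}

So the line is only actually masked if an interrupt fires while the irq
is soft-disabled.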

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:51           ` Avi Kivity
@ 2011-01-17 15:52             ` Marcelo Tosatti
  2011-01-17 16:01               ` Avi Kivity
  0 siblings, 1 reply; 45+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 15:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On Mon, Jan 17, 2011 at 02:51:37PM +0200, Avi Kivity wrote:
> On 01/17/2011 02:48 PM, Marcelo Tosatti wrote:
> >On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
> >>  On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >>  >>   >   +
> >>  >>   >   +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >>  >>   >   +		goto out;
> >>  >>
> >>  >>   Instead of copying to/from userspace (which is subject to swapin,
> >>  >>   unexpected values), you could include the guest written value in a
> >>  >>   kvm_run structure, along with address. Qemu-kvm would use that to
> >>  >>   synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> >>  >
> >  >We want to accelerate MSI-X mask bit accesses, which in most cases won't
> >  >exit to userspace. That's the cost we want to optimize. It's also possible
> >  >for userspace to read the correct value of the MMIO (but normally userspace
> >  >can't write to it, in order to prevent synchronization issues).
> >>
> >>  It's also good to have the values in just one place; using userspace
> >>  makes it easy for both the kernel and userspace to see the values
> >>  (and set them after migration, if/when we extend this to virtio).
> >
> >Right, that's an advantage, but:
> >
> >- How can userspace ever synchronize with updates by the kernel
> >   to the MSI-X entry?
> 
> When the guest writes a value which kvm cannot handle itself
> (i.e. a change to anything other than the mask bit), we exit with
> the table and entry ids, so userspace can reread them.

OK. But regarding access to the MSI-X entry in userspace, it can 
only be accessed safely wrt parallel updates by the kernel in the
exit handler.

Is the exit handler the only location where the MSI-X entry will be
read or written to, in userspace?

> >- Reading/writing to the userspace area must be done carefully,
> >   values must be validated before being used.
> 
> True every time...
> 
> >- Swapping issue (minor?).
> 
> I don't see the issue... just like any part of qemu that may be
> swapped out, blocking the vcpu thread.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 15:52             ` Marcelo Tosatti
@ 2011-01-17 16:01               ` Avi Kivity
  0 siblings, 0 replies; 45+ messages in thread
From: Avi Kivity @ 2011-01-17 16:01 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On 01/17/2011 05:52 PM, Marcelo Tosatti wrote:
> >
> >  When the guest writes a value which kvm cannot handle itself
> >  (i.e. a change to anything other than the mask bit), we exit with
> >  the table and entry ids, so userspace can reread them.
>
> OK. But regarding access to the MSI-X entry in userspace, it can
> only be accessed safely wrt parallel updates by the kernel in the
> exit handler.
>
> Is the exit handler the only location where the MSI-X entry will be
> read or written to, in userspace?
>

Yes.  It should be documented that the kernel owns the msix area as 
far as writes are concerned.
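
Something along these lines in api.txt would do (suggested wording, not in
the posted series):

  "After KVM_REGISTER_MSIX_MMIO, the kernel owns writes to the MSI-X table
   backing store at base_va.  Userspace must treat it as read-only, except
   when handling a KVM_EXIT_MSIX_ROUTING_UPDATE exit."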


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-17 12:45       ` Avi Kivity
@ 2011-01-19  8:21         ` Sheng Yang
  2011-01-25 12:47           ` Avi Kivity
  0 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-01-19  8:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Monday 17 January 2011 20:45:55 Avi Kivity wrote:
> On 01/17/2011 02:35 PM, Sheng Yang wrote:
> > On Monday 17 January 2011 20:21:45 Avi Kivity wrote:
> > >  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> > >  >  Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> > >  >  ---
> > >  >  
> > >  >    Documentation/kvm/api.txt |   41
> > >  >    +++++++++++++++++++++++++++++++++++++++++ 1 files changed, 41
> > >  >    insertions(+), 0 deletions(-)
> > >  >  
> > >  >  diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> > >  >  index e1a9297..4978b94 100644
> > >  >  --- a/Documentation/kvm/api.txt
> > >  >  +++ b/Documentation/kvm/api.txt
> > >  >  @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
> > >  >  
> > >  >    	__u16 padding[3];
> > >  >    
> > >  >    };
> > >  >  
> > >  >  +4.54 KVM_REGISTER_MSIX_MMIO
> > >  >  +
> > >  >  +Capability: KVM_CAP_MSIX_MMIO
> > >  >  +Architectures: x86
> > >  >  +Type: vm ioctl
> > >  >  +Parameters: struct kvm_msix_mmio_user (in)
> > >  >  +Returns: 0 on success, -1 on error
> > >  >  +
> > >  >  +This API registers the MSI-X MMIO address of a guest
> > >  >  +device. All MMIO operations on it are then handled by the kernel.
> > >  >  +When necessary (e.g. when the MSI data/address changed), KVM exits
> > >  >  +to userspace with KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO
> > >  >  +modification and require userspace to update the IRQ routing table.
> > >  >  +
> > >  >  +struct kvm_msix_mmio_user {
> > >  >  +	__u32 dev_id;
> > >  >  +	__u16 type;		/* Device type and MMIO address type */
> > >  >  +	__u16 max_entries_nr;	/* Maximum entries supported */
> > >  >  +	__u64 base_addr;	/* Guest physical address of MMIO */
> > >  >  +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> > >  >  +	__u64 flags;		/* Reserved for now */
> > >  >  +	__u64 reserved[4];
> > >  >  +};
> > >  >  +
> > >  >  +Current device type can be:
> > >  >  +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<   0)
> > >  >  +
> > >  >  +Current MMIO type can be:
> > >  >  +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<   8)
> > >  >  +
> > >  
> > >  How does userspace know which entry of which table changed?  Need a
> > >  field in struct kvm_run for that.
> > 
> > We already get the guest MMIO address for that in the exit information.
> > I've created a chain of handlers in qemu to handle it.
> 
> But we already decoded the table and entry...

But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO, so it's 
not quite handy to get the table and entry out. Also, the updater in userspace 
can share most of the logic with the ordinary userspace MMIO handler, which takes 
an address as parameter. So I think we don't need to pass the decoded table_id 
and entry to userspace.
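
A minimal sketch of that kind of address-keyed dispatch on the qemu side
(hypothetical helper names; the actual qemu-side patches are not shown in
this thread):

	typedef struct MSIXUpdateHandler {
		uint64_t base, len;
		void (*update)(uint64_t addr);	/* reread entry, update routing */
		struct MSIXUpdateHandler *next;
	} MSIXUpdateHandler;

	static MSIXUpdateHandler *msix_update_handlers;

	static void handle_msix_routing_update(uint64_t addr)
	{
		MSIXUpdateHandler *h;

		for (h = msix_update_handlers; h; h = h->next) {
			if (addr >= h->base && addr < h->base + h->len) {
				h->update(addr);
				break;
			}
		}
	}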

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:39       ` Marcelo Tosatti
@ 2011-01-19  8:37         ` Sheng Yang
  0 siblings, 0 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-19  8:37 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Monday 17 January 2011 20:39:30 Marcelo Tosatti wrote:
> On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote:
> > > > +		goto out;
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > +
> > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	/* Don't allow writing to other fields when the entry is unmasked */
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > > +		goto out;
> > > 
> > > Instead of copying to/from userspace (which is subject to swapin,
> > > unexpected values), you could include the guest written value in a
> > > kvm_run structure, along with address. Qemu-kvm would use that to
> > > synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE
> > > exit.
> > 
> > We want to accelerate MSI-X mask bit accesses, which in most cases
> > won't exit to userspace. That's the cost we want to optimize. It's
> > also possible for userspace to read the correct value of the MMIO
> > (but normally userspace can't write to it, in order to prevent
> > synchronization issues).
> 
> Yes, I meant exit to userspace only when necessary, but instead of
> copying directly every time, record the value the guest has written in
> kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE.
> 
> > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > +		ret = -ENOTSYNC;
> > > > +	if (old_ctrl == new_ctrl)
> > > > +		goto out;
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > 
> > > Then you rely on the kernel copy of the values to enable/disable irq.
> > 
> > Yes, they are guest-owned assigned IRQs. Any potential issue?
> 
> Nothing prevents the kernel from enabling or disabling irq multiple
> times with this code (what prevents it is a qemu-kvm that behaves as
> expected). This is not very good.
> 
> Perhaps the guest can only harm itself with that, but i'm not sure.

MSI-X interrupts are not shared, so I think the guest can only harm itself if it 
gets this wrong.
> 
> Also, if an msix table page is swapped out, the guest will hang.
> 
> > > > +	return r;
> > > > +}
> > > 
> > > This is not consistent with registration, where there are no checks
> > > regarding validity of assigned device id. So why is it necessary?
> > 
> > I don't quite understand. We need to free the mmio anyway; otherwise it
> > would result in wrong MMIO interception...
> 
> If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the
> read/write paths, there is no need to free anything.

It may work for assigned devices, but the potential users of this include vfio 
drivers and emulated devices. And I don't think it's a good idea to have a 
registration process but no free process...

--
regards
Yang, Sheng

> 
> > > There is a lock ordering problem BTW:
> > > 
> > > - read/write handlers: dev->lock -> kvm->lock
> > > - vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock ->
> > > dev->lock
> > 
> > Good catch! I'll fix it (and address the other comments, of course).
> > 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > +
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > +	mmio.type = mmio_user->type;
> > > > +
> > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > +}

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-19  8:21         ` Sheng Yang
@ 2011-01-25 12:47           ` Avi Kivity
  2011-01-26  9:05             ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-25 12:47 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/19/2011 10:21 AM, Sheng Yang wrote:
> >  >
> >  >  We already get the guest MMIO address for that in the exit information.
> >  >  I've created a chain of handlers in qemu to handle it.
> >
> >  But we already decoded the table and entry...
>
> But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO, so it's
> not quite handy to get the table and entry out.

The kernel handler can create a new kvm_run exit description.

>   Also, the updater in userspace
> can share most of the logic with the ordinary userspace MMIO handler, which
> takes an address as parameter. So I think we don't need to pass the decoded
> table_id and entry to userspace.

It's mixing layers, which always leads to trouble.  For one, the user 
handler shouldn't do anything with the write since the kernel already 
wrote it into the table.  For another, if two vcpus write to the same 
entry simultaneously, you could see different ordering in the kernel and 
userspace, and get inconsistent results.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-25 12:47           ` Avi Kivity
@ 2011-01-26  9:05             ` Sheng Yang
  2011-01-31 13:24               ` Avi Kivity
  0 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-01-26  9:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Tuesday 25 January 2011 20:47:38 Avi Kivity wrote:
> On 01/19/2011 10:21 AM, Sheng Yang wrote:
> > >  >  We already get the guest MMIO address for that in the exit
> > >  >  information. I've created a chain of handlers in qemu to handle it.
> > >  
> > >  But we already decoded the table and entry...
> > 
> > But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO,
> > so it's not quite handy to get the table and entry out.
> 
> The kernel handler can create a new kvm_run exit description.
> 
> >   Also, the updater in userspace
> > 
> > can share most of the logic with the ordinary userspace MMIO handler, which
> > takes an address as parameter. So I think we don't need to pass the decoded
> > table_id and entry to userspace.
> 
> It's mixing layers, which always leads to trouble.  For one, the user
> handler shouldn't do anything with the write since the kernel already
> wrote it into the table.  For another, if two vcpus write to the same
> entry simultaneously, you could see different ordering in the kernel and
> userspace, and get inconsistent results.

The shared logic is not about writing, but about interpreting what's written. The 
old MMIO handler would write the data, then interpret it; our new MMIO handler 
would only share the logic of interpretation. I think that's fair enough?

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:29   ` Avi Kivity
  2011-01-17 13:31     ` Michael S. Tsirkin
  2011-01-17 13:47     ` Jan Kiszka
@ 2011-01-30  4:38     ` Sheng Yang
  2011-01-31 13:09       ` Avi Kivity
  2 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-01-30  4:38 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

(Sorry, missed this mail...)

On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >Then we can support mask bit operation of assigned devices now.
> >
> >
> >
> >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >+				int assigned_dev_id, int entry, bool mask)
> >+{
> >+	int r = -EFAULT;
> >+	struct kvm_assigned_dev_kernel *adev;
> >+	int i;
> >+
> >+	if (!irqchip_in_kernel(kvm))
> >+		return r;
> >+
> >+	mutex_lock(&kvm->lock);
> >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >+				      assigned_dev_id);
> >+	if (!adev)
> >+		goto out;
> >+
> >+	for (i = 0; i < adev->entries_nr; i++)
> >+		if (adev->host_msix_entries[i].entry == entry) {
> >+			if (mask)
> >+				disable_irq_nosync(
> >+					adev->host_msix_entries[i].vector);
> 
> Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> the mask bit on irq delivery, so we may forward an interrupt to the
> guest after the mask bit was set.
> 
> What does pci say about the mask bit?  when does it take effect?
> 
> Another question is whether disable_irq_nosync() actually programs
> the device mask bit, or not.  If it does, then it's slow, and it may
> be better to leave interrupts enabled but have an internal pending
> bit.  If it doesn't program the mask bit, it's fine.

I think Michael and Jan had explained this.
> 
> >+			else
> >+				enable_irq(adev->host_msix_entries[i].vector);
> >+			r = 0;
> >+			break;
> >+		}
> >+out:
> >+	mutex_unlock(&kvm->lock);
> >+	return r;
> >+}
> >
> >+
> >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >+				void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, ret = 0, entry, offset, r;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx < 0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr & 0x3) || (len != 4 && len != 8))
> >+		goto out;
> >+
> >+	offset = addr & 0xf;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> >+		goto out;
> >+
> >+	mmio = &mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >+	if (r)
> >+		goto out;
> 
> and return ret == 0?

Yes. This access should be handled by the in-kernel MSI-X MMIO device, so we
return 0 and swallow the failed access. We can add a warning for it later.
> 
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> >+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> >+				int len, const void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, entry, offset, ret = 0, r = 0;
> >+	gpa_t entry_base;
> >+	u32 old_ctrl, new_ctrl;
> >+	u32 *ctrl_pos;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx < 0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr & 0x3) || (len != 4 && len != 8))
> >+		goto out;
> >+
> >+	offset = addr & 0xF;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> >+		goto out;
> >+
> >+	mmio = &mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> >+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> >+
> >+	if (get_user(old_ctrl, ctrl_pos))
> >+		goto out;
> >+
> >+	/* Don't allow writing to other fields when the entry is unmasked */
> >+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> >+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> >+		goto out;
> >+
> >+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >+		goto out;
> >+
> >+	if (get_user(new_ctrl, ctrl_pos))
> >+		goto out;
> 
> here, too.

The same as above.
> 
> >+
> >+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> >+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> >+		ret = -ENOTSYNC;
> 
> goto out?

No. This check only determines whether the MSI data/address was touched. The lines
below then check whether we need to operate on the mask bit, because in theory the
guest can use len=8 to modify the MSI-X data and ctrl at the same time.
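
For reference, the 16-byte MSI-X table entry layout that makes this possible:

	offset 0x0: Message Address (low 32 bits)
	offset 0x4: Message Address (high 32 bits)
	offset 0x8: Message Data
	offset 0xc: Vector Control (bit 0 is the mask bit)

A len=8 write at offset 0x8 therefore covers both Message Data and Vector
Control, so the data/address check and the mask bit handling have to be
evaluated independently.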

-- 
regards
Yang, Sheng
> 
> >+	if (old_ctrl == new_ctrl)
> >+		goto out;
> >+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> >+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> >+	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> >+			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> >+	if (r || ret)
> >+		ret = -ENOTSYNC;
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> 
> -- 
> error compiling committee.c: too many arguments to function
> 

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-30  4:38     ` Sheng Yang
@ 2011-01-31 13:09       ` Avi Kivity
  2011-02-01  4:21         ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-31 13:09 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/30/2011 06:38 AM, Sheng Yang wrote:
> (Sorry, missed this mail...)
>
> On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> >  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >  >Then we can support mask bit operation of assigned devices now.
> >  >
> >  >
> >  >
> >  >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >  >+				int assigned_dev_id, int entry, bool mask)
> >  >+{
> >  >+	int r = -EFAULT;
> >  >+	struct kvm_assigned_dev_kernel *adev;
> >  >+	int i;
> >  >+
> >  >+	if (!irqchip_in_kernel(kvm))
> >  >+		return r;
> >  >+
> >  >+	mutex_lock(&kvm->lock);
> >  >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >  >+				      assigned_dev_id);
> >  >+	if (!adev)
> >  >+		goto out;
> >  >+
> >  >+	for (i = 0; i < adev->entries_nr; i++)
> >  >+		if (adev->host_msix_entries[i].entry == entry) {
> >  >+			if (mask)
> >  >+				disable_irq_nosync(
> >  >+					adev->host_msix_entries[i].vector);
> >
> >  Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> >  the mask bit on irq delivery, so we may forward an interrupt to the
> >  guest after the mask bit was set.
> >
> >  What does pci say about the mask bit?  when does it take effect?
> >
> >  Another question is whether disable_irq_nosync() actually programs
> >  the device mask bit, or not.  If it does, then it's slow, and it may
> >  be better to leave interrupts enabled but have an internal pending
> >  bit.  If it doesn't program the mask bit, it's fine.
>
> I think Michael and Jan had explained this.
> >
> >  >+			else
> >  >+				enable_irq(adev->host_msix_entries[i].vector);
> >  >+			r = 0;
> >  >+			break;
> >  >+		}
> >  >+out:
> >  >+	mutex_unlock(&kvm->lock);
> >  >+	return r;
> >  >+}
> >  >
> >  >+
> >  >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  >+				void *val)
> >  >+{
> >  >+	struct kvm_msix_mmio_dev *mmio_dev =
> >  >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >  >+	struct kvm_msix_mmio *mmio;
> >  >+	int idx, ret = 0, entry, offset, r;
> >  >+
> >  >+	mutex_lock(&mmio_dev->lock);
> >  >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >  >+	if (idx < 0) {
> >  >+		ret = -EOPNOTSUPP;
> >  >+		goto out;
> >  >+	}
> >  >+	if ((addr & 0x3) || (len != 4 && len != 8))
> >  >+		goto out;
> >  >+
> >  >+	offset = addr & 0xf;
> >  >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> >  >+		goto out;
> >  >+
> >  >+	mmio = &mmio_dev->mmio[idx];
> >  >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >  >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >  >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >  >+	if (r)
> >  >+		goto out;
> >
> >  and return ret == 0?
>
> >Yes. This access should be handled by the in-kernel MSI-X MMIO device, so we
> >return 0 and swallow the failed access. We can add a warning for it later.

But it failed.  We need to return -EFAULT.

> The same as above.
> >
> >  >+
> >  >+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> >  >+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> >  >+		ret = -ENOTSYNC;
> >
> >  goto out?
>
> No. This check only determines whether the MSI data/address was touched. The
> lines below then check whether we need to operate on the mask bit, because in
> theory the guest can use len=8 to modify the MSI-X data and ctrl at the same time.
>

Ok, makes sense.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-26  9:05             ` Sheng Yang
@ 2011-01-31 13:24               ` Avi Kivity
  2011-02-01  4:26                 ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Avi Kivity @ 2011-01-31 13:24 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/26/2011 11:05 AM, Sheng Yang wrote:
> On Tuesday 25 January 2011 20:47:38 Avi Kivity wrote:
> >  On 01/19/2011 10:21 AM, Sheng Yang wrote:
> >  >  >   >   We already get the guest MMIO address for that in the exit
> >  >  >   >   information. I've created a chain of handlers in qemu to handle it.
> >  >  >
> >  >  >   But we already decoded the table and entry...
> >  >
> >  >  But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO,
> >  >  so it's not quite handy to get the table and entry out.
> >
> >  The kernel handler can create a new kvm_run exit description.
> >
> >  >    Also, the updater in userspace
> >  >
> >  >  can share most of the logic with the ordinary userspace MMIO handler, which
> >  >  takes an address as parameter. So I think we don't need to pass the decoded
> >  >  table_id and entry to userspace.
> >
> >  It's mixing layers, which always leads to trouble.  For one, the user
> >  handler shouldn't do anything with the write since the kernel already
> >  wrote it into the table.  For another, if two vcpus write to the same
> >  entry simultaneously, you could see different ordering in the kernel and
> >  userspace, and get inconsistent results.
>
> The shared logic is not about writing, but about interpreting what's written. The
> old MMIO handler would write the data, then interpret it; our new MMIO handler
> would only share the logic of interpretation. I think that's fair enough?

It doesn't make sense from an API point of view.  You registered a table 
of entries; you expect an exit on that table to point to the table and 
entry that got changed.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-31 13:09       ` Avi Kivity
@ 2011-02-01  4:21         ` Sheng Yang
  0 siblings, 0 replies; 45+ messages in thread
From: Sheng Yang @ 2011-02-01  4:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Mon, Jan 31, 2011 at 03:09:09PM +0200, Avi Kivity wrote:
> On 01/30/2011 06:38 AM, Sheng Yang wrote:
> >(Sorry, missed this mail...)
> >
> >On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> >>  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >>  >Then we can support mask bit operation of assigned devices now.
> >>  >
> >>  >
> >>  >
> >>  >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >>  >+				int assigned_dev_id, int entry, bool mask)
> >>  >+{
> >>  >+	int r = -EFAULT;
> >>  >+	struct kvm_assigned_dev_kernel *adev;
> >>  >+	int i;
> >>  >+
> >>  >+	if (!irqchip_in_kernel(kvm))
> >>  >+		return r;
> >>  >+
> >>  >+	mutex_lock(&kvm->lock);
> >>  >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >>  >+				      assigned_dev_id);
> >>  >+	if (!adev)
> >>  >+		goto out;
> >>  >+
> >>  >+	for (i = 0; i < adev->entries_nr; i++)
> >>  >+		if (adev->host_msix_entries[i].entry == entry) {
> >>  >+			if (mask)
> >>  >+				disable_irq_nosync(
> >>  >+					adev->host_msix_entries[i].vector);
> >>
> >>  Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> >>  the mask bit on irq delivery, so we may forward an interrupt to the
> >>  guest after the mask bit was set.
> >>
> >>  What does pci say about the mask bit?  when does it take effect?
> >>
> >>  Another question is whether disable_irq_nosync() actually programs
> >>  the device mask bit, or not.  If it does, then it's slow, and it may
> >>  be better to leave interrupts enabled but have an internal pending
> >>  bit.  If it doesn't program the mask bit, it's fine.
> >
> >I think Michael and Jan had explained this.
> >>
> >>  >+			else
> >>  >+				enable_irq(adev->host_msix_entries[i].vector);
> >>  >+			r = 0;
> >>  >+			break;
> >>  >+		}
> >>  >+out:
> >>  >+	mutex_unlock(&kvm->lock);
> >>  >+	return r;
> >>  >+}
> >>  >
> >>  >+
> >>  >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >>  >+				void *val)
> >>  >+{
> >>  >+	struct kvm_msix_mmio_dev *mmio_dev =
> >>  >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >>  >+	struct kvm_msix_mmio *mmio;
> >>  >+	int idx, ret = 0, entry, offset, r;
> >>  >+
> >>  >+	mutex_lock(&mmio_dev->lock);
> >>  >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >>  >+	if (idx < 0) {
> >>  >+		ret = -EOPNOTSUPP;
> >>  >+		goto out;
> >>  >+	}
> >>  >+	if ((addr & 0x3) || (len != 4 && len != 8))
> >>  >+		goto out;
> >>  >+
> >>  >+	offset = addr & 0xf;
> >>  >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> >>  >+		goto out;
> >>  >+
> >>  >+	mmio = &mmio_dev->mmio[idx];
> >>  >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >>  >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >>  >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >>  >+	if (r)
> >>  >+		goto out;
> >>
> >>  and return ret == 0?
> >
> >Yes. This operation should be handled by in-kernel MSI-X MMIO, so we return 0
> >to treat this access as handled. We can add a warning for it later.
> 
> But it failed.  We need to return -EFAULT.

So it would return to QEmu. OK, let QEmu print a warning about it.

-- 
regards
Yang, Sheng
> 
> >The same as above.
> >>
> >>  >+
> >>  >+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> >>  >+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> >>  >+		ret = -ENOTSYNC;
> >>
> >>  goto out?
> >
> >No. This judgement only checks whether the MSI data/address was touched. The line
> >below checks whether we need to operate on the mask bit, because in theory the
> >guest can use len=8 to modify MSI-X data and ctrl at the same time.
> >
> 
> Ok, makes sense.
> 
> -- 
> error compiling committee.c: too many arguments to function
> 


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-31 13:24               ` Avi Kivity
@ 2011-02-01  4:26                 ` Sheng Yang
  0 siblings, 0 replies; 45+ messages in thread
From: Sheng Yang @ 2011-02-01  4:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Mon, Jan 31, 2011 at 03:24:27PM +0200, Avi Kivity wrote:
> On 01/26/2011 11:05 AM, Sheng Yang wrote:
> >On Tuesday 25 January 2011 20:47:38 Avi Kivity wrote:
> >>  On 01/19/2011 10:21 AM, Sheng Yang wrote:
> >>  >  >   >   We already got a guest MMIO address for that in the exit
> >>  >  >   >   information. I've created a chain of handlers in qemu to handle it.
> >>  >  >
> >>  >  >   But we already decoded the table and entry...
> >>  >
> >>  >  But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO.
> >>  >  So it's not quite handy to get the table and entry out.
> >>
> >>  The kernel handler can create a new kvm_run exit description.
> >>
> >>  >    Also the updater in the userspace
> >>  >
> >>  >  can share most of the logic with the ordinary userspace MMIO handler, which
> >>  >  takes an address as parameter. So I think we don't need to pass the decoded
> >>  >  table_id and entry to userspace.
> >>
> >>  It's mixing layers, which always leads to trouble.  For one, the user
> >>  handler shouldn't do anything with the write since the kernel already
> >>  wrote it into the table.  For another, if two vcpus write to the same
> >>  entry simultaneously, you could see different ordering in the kernel and
> >>  userspace, and get inconsistent results.
> >
> >The shared logic is not about writing, but about interpreting what's written. The
> >old MMIO handler would write the data, then interpret it; our new MMIO handler
> >would only share the interpretation logic. I think that's fair enough?
> 
> It doesn't make sense from an API point of view.  You registered a
> table of entries; you expect an exit on that table to point to the
> table and entry that got changed.

OK, I would update this when I come back to work (about two weeks later...).

-- 
regards
Yang, Sheng

> 
> -- 
> error compiling committee.c: too many arguments to function
> 


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-25  6:12             ` Sheng Yang
@ 2011-02-25  8:46               ` Michael S. Tsirkin
  0 siblings, 0 replies; 45+ messages in thread
From: Michael S. Tsirkin @ 2011-02-25  8:46 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Fri, Feb 25, 2011 at 02:12:42PM +0800, Sheng Yang wrote:
> On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote:
> > On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote:
> > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > > > > Then we can support mask bit operation of assigned devices now.
> > > > > > 
> > > > > > Looks pretty good overall.  A few comments below.  It seems like we
> > > > > > should be able to hook this into vfio with a small stub in kvm.  We
> > > > > > just need to be able to communicate disabling and enabling of
> > > > > > individual msix vectors.  For brute force, we could do this with a
> > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per
> > > > > > MSI-X vector.  Not sure if there's something smaller that could do
> > > > > > it. Thanks,
> > > > > 
> > > > > Alex, thanks for your comments. See my comments below:
> > > > > > Alex
> > > > > > 
> > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > > > > ---
> > > > > > > 
> > > > > > >  arch/x86/kvm/Makefile    |    2 +-
> > > > > > >  arch/x86/kvm/x86.c       |    8 +-
> > > > > > >  include/linux/kvm.h      |   21 ++++
> > > > > > >  include/linux/kvm_host.h |   25 ++++
> > > > > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > > > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > > > > >  virt/kvm/msix_mmio.c     |  286
> > > > > > >  ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  virt/kvm/msix_mmio.h
> > > > > > >  
> > > > > > >  |   25 ++++
> > > > > > >  
> > > > > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > > > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > > > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > > > > 
> > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > > > > index f15501f..3a0d851 100644
> > > > > > > --- a/arch/x86/kvm/Makefile
> > > > > > > +++ b/arch/x86/kvm/Makefile
> > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > > > > 
> > > > > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > > 
> > > > > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > > > > 
> > > > > > > -				assigned-dev.o)
> > > > > > > +				assigned-dev.o msix_mmio.o)
> > > > > > > 
> > > > > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/,
> > > > > > >  iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix
> > > > > > >  ../../../virt/kvm/, async_pf.o)
> > > > > > > 
> > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > > > index fa708c9..89bf12c 100644
> > > > > > > --- a/arch/x86/kvm/x86.c
> > > > > > > +++ b/arch/x86/kvm/x86.c
> > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > > > 
> > > > > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > > > > >  	case KVM_CAP_XSAVE:
> > > > > > > 
> > > > > > >  	case KVM_CAP_ASYNC_PF:
> > > > > > > +	case KVM_CAP_MSIX_MMIO:
> > > > > > >  		r = 1;
> > > > > > >  		break;
> > > > > > >  	
> > > > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > > > @@ -3807,6 +3808,7 @@ static int
> > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > 
> > > > > > >  					   struct kvm_vcpu *vcpu)
> > > > > > >  
> > > > > > >  {
> > > > > > >  
> > > > > > >  	gpa_t                 gpa;
> > > > > > > 
> > > > > > > +	int r;
> > > > > > > 
> > > > > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > > > > 
> > > > > > > @@ -3822,14 +3824,16 @@ static int
> > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > 
> > > > > > >  mmio:
> > > > > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > > > 
> > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > 
> > > > > > >  	/*
> > > > > > >  	
> > > > > > >  	 * Is this MMIO handled locally?
> > > > > > >  	 */
> > > > > > > 
> > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > +	if (!r)
> > > > > > > 
> > > > > > >  		return X86EMUL_CONTINUE;
> > > > > > >  	
> > > > > > >  	vcpu->mmio_needed = 1;
> > > > > > > 
> > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > 
> > > > This use of -ENOTSYNC is IMO confusing.
> > > > How about we make vcpu_mmio_write return the positive exit reason?
> > > > Negative value will mean an error.
> > > 
> > > In fact, currently a negative value means something more needs to be done,
> > > the same as an MMIO exit.
> > 
> > So it would be
> > 	if (!r)
> >   		return X86EMUL_CONTINUE;
> > 
> > 	vcpu->run->exit_reason = r;
> > 
> > > Now I think we can keep it, or update them all later.
> > 
> > The way to do this would be
> > 1. patch to return KVM_EXIT_MMIO on mmio
> > 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top
> 
> It's not that straightforward. In most conditions, vcpu_mmio_write() < 0 means
> KVM itself was unable to complete the request. That's quite straightforward.
> But each handler in the chain can't decide by itself that the result should be
> "KVM_EXIT_MMIO"; that is only known once all of them fail to handle the access.
> 

Well, this just violates the standard API for return value.

The standard semantics are:
- negative - error
- positive - not an error

So you can just return a positive KVM_EXIT_MSIX_ROUTING_UPDATE
instead of ENOTSYNC. This way you do not need to rework all code
and you don't spread MSIX all over kvm.

You just do:
	(r > 0) ? r : KVM_EXIT_MMIO

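As a sketch, the call site in emulator_write_emulated_onepage() would then
read roughly like this (assuming in-kernel handlers return positive exit
reasons, per the convention above):

	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
	if (!r)
		return X86EMUL_CONTINUE;

	vcpu->mmio_needed = 1;
	vcpu->run->exit_reason = (r > 0) ? r : KVM_EXIT_MMIO;
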

> I am not sure we want every single handler to say "I want KVM_EXIT_MMIO" instead
> of returning an error.
> We can discuss this more, but since it's not an API/ABI change,
> I think we can get the patch in first.
> --
> regards
> Yang, Sheng
> 
> > 
> > > --
> > > regards
> > > Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-24 10:17           ` Michael S. Tsirkin
@ 2011-02-25  6:12             ` Sheng Yang
  2011-02-25  8:46               ` Michael S. Tsirkin
  0 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-02-25  6:12 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote:
> On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote:
> > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > > > Then we can support mask bit operation of assigned devices now.
> > > > > 
> > > > > Looks pretty good overall.  A few comments below.  It seems like we
> > > > > should be able to hook this into vfio with a small stub in kvm.  We
> > > > > just need to be able to communicate disabling and enabling of
> > > > > individual msix vectors.  For brute force, we could do this with a
> > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per
> > > > > MSI-X vector.  Not sure if there's something smaller that could do
> > > > > it. Thanks,
> > > > 
> > > > Alex, thanks for your comments. See my comments below:
> > > > > Alex
> > > > > 
> > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > > > ---
> > > > > > 
> > > > > >  arch/x86/kvm/Makefile    |    2 +-
> > > > > >  arch/x86/kvm/x86.c       |    8 +-
> > > > > >  include/linux/kvm.h      |   21 ++++
> > > > > >  include/linux/kvm_host.h |   25 ++++
> > > > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > > > >  virt/kvm/msix_mmio.c     |  286
> > > > > >  ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  virt/kvm/msix_mmio.h
> > > > > >  
> > > > > >  |   25 ++++
> > > > > >  
> > > > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > > > 
> > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > > > index f15501f..3a0d851 100644
> > > > > > --- a/arch/x86/kvm/Makefile
> > > > > > +++ b/arch/x86/kvm/Makefile
> > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > > > 
> > > > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > 
> > > > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > > > 
> > > > > > -				assigned-dev.o)
> > > > > > +				assigned-dev.o msix_mmio.o)
> > > > > > 
> > > > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/,
> > > > > >  iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix
> > > > > >  ../../../virt/kvm/, async_pf.o)
> > > > > > 
> > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > > index fa708c9..89bf12c 100644
> > > > > > --- a/arch/x86/kvm/x86.c
> > > > > > +++ b/arch/x86/kvm/x86.c
> > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > > 
> > > > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > > > >  	case KVM_CAP_XSAVE:
> > > > > > 
> > > > > >  	case KVM_CAP_ASYNC_PF:
> > > > > > +	case KVM_CAP_MSIX_MMIO:
> > > > > >  		r = 1;
> > > > > >  		break;
> > > > > >  	
> > > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > > @@ -3807,6 +3808,7 @@ static int
> > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > 
> > > > > >  					   struct kvm_vcpu *vcpu)
> > > > > >  
> > > > > >  {
> > > > > >  
> > > > > >  	gpa_t                 gpa;
> > > > > > 
> > > > > > +	int r;
> > > > > > 
> > > > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > > > 
> > > > > > @@ -3822,14 +3824,16 @@ static int
> > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > 
> > > > > >  mmio:
> > > > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > > 
> > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > 
> > > > > >  	/*
> > > > > >  	
> > > > > >  	 * Is this MMIO handled locally?
> > > > > >  	 */
> > > > > > 
> > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > +	if (!r)
> > > > > > 
> > > > > >  		return X86EMUL_CONTINUE;
> > > > > >  	
> > > > > >  	vcpu->mmio_needed = 1;
> > > > > > 
> > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > 
> > > This use of -ENOTSYNC is IMO confusing.
> > > How about we make vcpu_mmio_write return the positive exit reason?
> > > Negative value will mean an error.
> > 
> > In fact, currently a negative value means something more needs to be done,
> > the same as an MMIO exit.
> 
> So it would be
> 	if (!r)
>   		return X86EMUL_CONTINUE;
> 
> 	vcpu->run->exit_reason = r;
> 
> > Now I think we can keep it, or update them all later.
> 
> The way to do this would be
> 1. patch to return KVM_EXIT_MMIO on mmio
> 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top

It's not that straightforward. In most conditions, vcpu_mmio_write() < 0 means
KVM itself was unable to complete the request. That's quite straightforward.
But each handler in the chain can't decide by itself that the result should be
"KVM_EXIT_MMIO"; that is only known once all of them fail to handle the access.

I am not sure we want every single handler to say "I want KVM_EXIT_MMIO" instead
of returning an error. We can discuss this more, but since it's not an API/ABI
change, I think we can get the patch in first.

--
regards
Yang, Sheng

> 
> > --
> > regards
> > Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-24 10:11           ` Michael S. Tsirkin
@ 2011-02-25  5:54             ` Sheng Yang
  0 siblings, 0 replies; 45+ messages in thread
From: Sheng Yang @ 2011-02-25  5:54 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Thursday 24 February 2011 18:11:44 Michael S. Tsirkin wrote:
> On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote:
> > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > > > Then we can support mask bit operation of assigned devices now.
> > > > > 
> > > > > Looks pretty good overall.  A few comments below.  It seems like we
> > > > > should be able to hook this into vfio with a small stub in kvm.  We
> > > > > just need to be able to communicate disabling and enabling of
> > > > > individual msix vectors.  For brute force, we could do this with a
> > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per
> > > > > MSI-X vector.  Not sure if there's something smaller that could do
> > > > > it. Thanks,
> > > > 
> > > > Alex, thanks for your comments. See my comments below:
> > > > > Alex
> > > > > 
> > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > > > ---
> > > > > > 
> > > > > >  arch/x86/kvm/Makefile    |    2 +-
> > > > > >  arch/x86/kvm/x86.c       |    8 +-
> > > > > >  include/linux/kvm.h      |   21 ++++
> > > > > >  include/linux/kvm_host.h |   25 ++++
> > > > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > > > >  virt/kvm/msix_mmio.c     |  286
> > > > > >  ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  virt/kvm/msix_mmio.h
> > > > > >  
> > > > > >  |   25 ++++
> > > > > >  
> > > > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > > > 
> > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > > > index f15501f..3a0d851 100644
> > > > > > --- a/arch/x86/kvm/Makefile
> > > > > > +++ b/arch/x86/kvm/Makefile
> > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > > > 
> > > > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > 
> > > > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > > > 
> > > > > > -				assigned-dev.o)
> > > > > > +				assigned-dev.o msix_mmio.o)
> > > > > > 
> > > > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/,
> > > > > >  iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix
> > > > > >  ../../../virt/kvm/, async_pf.o)
> > > > > > 
> > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > > index fa708c9..89bf12c 100644
> > > > > > --- a/arch/x86/kvm/x86.c
> > > > > > +++ b/arch/x86/kvm/x86.c
> > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > > 
> > > > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > > > >  	case KVM_CAP_XSAVE:
> > > > > > 
> > > > > >  	case KVM_CAP_ASYNC_PF:
> > > > > > +	case KVM_CAP_MSIX_MMIO:
> > > > > >  		r = 1;
> > > > > >  		break;
> > > > > >  	
> > > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > > @@ -3807,6 +3808,7 @@ static int
> > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > 
> > > > > >  					   struct kvm_vcpu *vcpu)
> > > > > >  
> > > > > >  {
> > > > > >  
> > > > > >  	gpa_t                 gpa;
> > > > > > 
> > > > > > +	int r;
> > > > > > 
> > > > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > > > 
> > > > > > @@ -3822,14 +3824,16 @@ static int
> > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > 
> > > > > >  mmio:
> > > > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > > 
> > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > 
> > > > > >  	/*
> > > > > >  	
> > > > > >  	 * Is this MMIO handled locally?
> > > > > >  	 */
> > > > > > 
> > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > +	if (!r)
> > > > > > 
> > > > > >  		return X86EMUL_CONTINUE;
> > > > > >  	
> > > > > >  	vcpu->mmio_needed = 1;
> > > > > > 
> > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > 
> > > This use of -ENOTSYNC is IMO confusing.
> > > How about we make vcpu_mmio_write return the positive exit reason?
> > > Negative value will mean an error.
> > 
> > Makes sense. I would update it.
> > 
> > > > > >  	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > > > >  	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > > > >  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > > > 
> > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > > index ea2dc1a..ad9df4b 100644
> > > > > > --- a/include/linux/kvm.h
> > > > > > +++ b/include/linux/kvm.h
> > > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > > > > 
> > > > > >  #define KVM_EXIT_NMI              16
> > > > > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > > > > >  #define KVM_EXIT_OSI              18
> > > > > > 
> > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > > > > 
> > > > > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > > > > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > > > > 
> > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
> > > > > > 
> > > > > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > > > > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > > > > >  #define KVM_CAP_ASYNC_PF 59
> > > > > > 
> > > > > > +#define KVM_CAP_MSIX_MMIO 60
> > > > > > 
> > > > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > > > 
> > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data {
> > > > > > 
> > > > > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
> > > > > >  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
> > > > > >  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > > > > 
> > > > > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > > > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > > > > > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> > > > > > 
> > > > > >  /* Available with KVM_CAP_PIT_STATE2 */
> > > > > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
> > > > > >  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> > > > > > 
> > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
> > > > > > 
> > > > > >  	__u16 padding[3];
> > > > > >  
> > > > > >  };
> > > > > > 
> > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > > > > +
> > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > > > > +
> > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > +struct kvm_msix_mmio_user {
> > > > > > +	__u32 dev_id;
> > > > > > +	__u16 type;
> > > > > > +	__u16 max_entries_nr;
> > > > > > +	__u64 base_addr;
> > > > > > +	__u64 base_va;
> > > > > > +	__u64 flags;
> > > > > > +	__u64 reserved[4];
> > > > > > +};
> > > > > > +
> > > > > > 
> > > > > >  #endif /* __LINUX_KVM_H */
> > > > > > 
> > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > index 7d313e0..c10670c 100644
> > > > > > --- a/include/linux/kvm_host.h
> > > > > > +++ b/include/linux/kvm_host.h
> > > > > > @@ -233,6 +233,27 @@ struct kvm_memslots {
> > > > > > 
> > > > > >  					KVM_PRIVATE_MEM_SLOTS];
> > > > > >  
> > > > > >  };
> > > > > > 
> > > > > > +#define KVM_MSIX_MMIO_MAX    32
> > > > > 
> > > > > I assume this is based on 32 devices/bus, but we could exceed this
> > > > > is we add more buses or make use of more functions.  How hard is
> > > > > it to make this dynamic?
> > > > 
> > > > We can improve this later (maybe with a balanced tree?). 32 should be
> > > > more than sufficient for now. And this patchset has been on the
> > > > mailing list for months; I really want to get it in first...
> > > > 
> > > > > > +
> > > > > > +struct kvm_msix_mmio {
> > > > > > +	u32 dev_id;
> > > > > > +	u16 type;
> > > > > > +	u16 max_entries_nr;
> > > > > > +	u64 flags;
> > > > > > +	gpa_t table_base_addr;
> > > > > > +	hva_t table_base_va;
> > > > > > +	gpa_t pba_base_addr;
> > > > > > +	hva_t pba_base_va;
> > > > > > +};
> > > > > > +
> > > > > > +struct kvm_msix_m4mio_dev {
> > > > > > +	struct kvm *kvm;
> > > > > > +	struct kvm_io_device table_dev;
> > > > > > +	int mmio_nr;
> > > > > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > > > > +	struct mutex lock;
> > > > > > +};
> > > > > > +
> > > > > > 
> > > > > >  struct kvm {
> > > > > >  
> > > > > >  	spinlock_t mmu_lock;
> > > > > >  	raw_spinlock_t requests_lock;
> > > > > > 
> > > > > > @@ -281,6 +302,7 @@ struct kvm {
> > > > > > 
> > > > > >  	long mmu_notifier_count;
> > > > > >  
> > > > > >  #endif
> > > > > >  
> > > > > >  	long tlbs_dirty;
> > > > > > 
> > > > > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > > > > 
> > > > > >  };
> > > > > >  
> > > > > >  /* The guest did something we don't support. */
> > > > > > 
> > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct
> > > > > > kvm *kvm,
> > > > > > 
> > > > > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > > > > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > > > > 
> > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > > +			int assigned_dev_id, int entry, bool mask);
> > > > > > +
> > > > > > 
> > > > > >  /* For vcpu->arch.iommu_flags */
> > > > > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > > > > 
> > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > > > > index ae72ae6..d1598a6 100644
> > > > > > --- a/virt/kvm/assigned-dev.c
> > > > > > +++ b/virt/kvm/assigned-dev.c
> > > > > > @@ -18,6 +18,7 @@
> > > > > > 
> > > > > >  #include <linux/interrupt.h>
> > > > > >  #include <linux/slab.h>
> > > > > >  #include "irq.h"
> > > > > > 
> > > > > > +#include "msix_mmio.h"
> > > > > > 
> > > > > >  static struct kvm_assigned_dev_kernel
> > > > > >  *kvm_find_assigned_dev(struct list_head *head,
> > > > > >  
> > > > > >  						      int assigned_dev_id)
> > > > > > 
> > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct
> > > > > > kvm *kvm,
> > > > > > 
> > > > > >  	kvm_deassign_irq(kvm, assigned_dev,
> > > > > >  	assigned_dev->irq_requested_type);
> > > > > >  
> > > > > >  }
> > > > > > 
> > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > > > > +				struct kvm_assigned_dev_kernel *adev)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio mmio;
> > > > > > +
> > > > > > +	mmio.dev_id = adev->assigned_dev_id;
> > > > > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > > > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > > > > +}
> > > > > > +
> > > > > > 
> > > > > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > > > > >  
> > > > > >  				     struct kvm_assigned_dev_kernel
> > > > > >  				     *assigned_dev)
> > > > > >  
> > > > > >  {
> > > > > >  
> > > > > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > > > > 
> > > > > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > > > > +
> > > > > > 
> > > > > >  	__pci_reset_function(assigned_dev->dev);
> > > > > >  	pci_restore_state(assigned_dev->dev);
> > > > > > 
> > > > > > @@ -785,3 +799,33 @@ out:
> > > > > >  	return r;
> > > > > >  
> > > > > >  }
> > > > > > 
> > > > > > +/* The caller should hold kvm->lock */
> > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > > +				int assigned_dev_id, int entry, bool mask)
> > > > > > +{
> > > > > > +	int r = -EFAULT;
> > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > +	int i;
> > > > > > +
> > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > +		return r;
> > > > > > +
> > > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > +				      assigned_dev_id);
> > > > > > +	if (!adev)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > > > > +	for (i = 0; i < adev->entries_nr; i++)
> > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > +			if (mask)
> > > > > > +				disable_irq_nosync(
> > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > +			else
> > > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > > +			r = 0;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +out:
> > > > > > +	return r;
> > > > > > +}
> > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > > > index b1b6cbb..b7807c8 100644
> > > > > > --- a/virt/kvm/kvm_main.c
> > > > > > +++ b/virt/kvm/kvm_main.c
> > > > > > @@ -56,6 +56,7 @@
> > > > > > 
> > > > > >  #include "coalesced_mmio.h"
> > > > > >  #include "async_pf.h"
> > > > > > 
> > > > > > +#include "msix_mmio.h"
> > > > > > 
> > > > > >  #define CREATE_TRACE_POINTS
> > > > > >  #include <trace/events/kvm.h>
> > > > > > 
> > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > > > > 
> > > > > >  	struct mm_struct *mm = kvm->mm;
> > > > > >  	
> > > > > >  	kvm_arch_sync_events(kvm);
> > > > > > 
> > > > > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > > > > 
> > > > > >  	spin_lock(&kvm_lock);
> > > > > >  	list_del(&kvm->vm_list);
> > > > > >  	spin_unlock(&kvm_lock);
> > > > > > 
> > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file
> > > > > > *filp,
> > > > > > 
> > > > > >  		mutex_unlock(&kvm->lock);
> > > > > >  		break;
> > > > > >  
> > > > > >  #endif
> > > > > > 
> > > > > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > > > +
> > > > > > +		r = -EFAULT;
> > > > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > > > +			goto out;
> > > > > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > > > > +		break;
> > > > > > +	}
> > > > > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > > > +
> > > > > > +		r = -EFAULT;
> > > > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > > > +			goto out;
> > > > > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > > > > +		break;
> > > > > > +	}
> > > > > > 
> > > > > >  	default:
> > > > > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > > > > >  		if (r == -ENOTTY)
> > > > > > 
> > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > > > 
> > > > > >  		return r;
> > > > > >  	
> > > > > >  	}
> > > > > >  
> > > > > >  #endif
> > > > > > 
> > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > +	if (r < 0) {
> > > > > > +		kvm_put_kvm(kvm);
> > > > > > +		return r;
> > > > > > +	}
> > > > > > +
> > > > > > 
> > > > > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > > > > >  	if (r < 0)
> > > > > >  	
> > > > > >  		kvm_put_kvm(kvm);
> > > > > > 
> > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct
> > > > > > kvm_io_bus *bus)
> > > > > > 
> > > > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx,
> > > > > >  gpa_t addr,
> > > > > >  
> > > > > >  		     int len, const void *val)
> > > > > >  
> > > > > >  {
> > > > > > 
> > > > > > -	int i;
> > > > > > +	int i, r = -EOPNOTSUPP;
> > > > > > 
> > > > > >  	struct kvm_io_bus *bus;
> > > > > >  	
> > > > > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > > > > 
> > > > > > -	for (i = 0; i < bus->dev_count; i++)
> > > > > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > > > > > +	for (i = 0; i < bus->dev_count; i++) {
> > > > > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > > > > > +		if (r == -ENOTSYNC)
> > > > > > +			break;
> > > > > > +		else if (!r)
> > > > > > 
> > > > > >  			return 0;
> > > 
> > > So the above will be
> > > 
> > >  if (r >= 0)
> > >  
> > > 	 return r;
> > > 	 
> > > > > > -	return -EOPNOTSUPP;
> > > > > > +	}
> > > > > > +	return r;
> > > > > > 
> > > > > >  }
> > > > > >  
> > > > > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > > > > 
> > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > > > > new file mode 100644
> > > > > > index 0000000..dde9b62
> > > > > > --- /dev/null
> > > > > > +++ b/virt/kvm/msix_mmio.c
> > > > > > @@ -0,0 +1,286 @@
> > > > > > +/*
> > > > > > + * MSI-X MMIO emulation
> > > > > > + *
> > > > > > + * Copyright (c) 2010 Intel Corporation
> > > > > > + *
> > > > > > + * This work is licensed under the terms of the GNU GPL, version 2. See
> > > > > > + * the COPYING file in the top-level directory.
> > > > > > + *
> > > > > > + * Author:
> > > > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > > > + */
> > > > > > +
> > > > > > +#include <linux/kvm_host.h>
> > > > > > +#include <linux/kvm.h>
> > > > > > +
> > > > > > +#include "msix_mmio.h"
> > > > > > +#include "iodev.h"
> > > > > > +
> > > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> > > > > > +				int entry, u32 flag)
> > > > > > +{
> > > > > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > > > > +				mmio->dev_id, entry, flag);
> > > > > > +	return -EFAULT;
> > > > > > +}
> > > > > > +
> > > > > > +/* Caller must hold dev->lock */
> > > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > > > > +				gpa_t addr, int len)
> > > > > > +{
> > > > > > +	gpa_t start, end;
> > > > > > +	int i, r = -EINVAL;
> > > > > > +
> > > > > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > > > > +		start = dev->mmio[i].table_base_addr;
> > > > > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > > > > +			dev->mmio[i].max_entries_nr;
> > > > > > +		if (addr >= start && addr + len <= end) {
> > > > > > +			r = i;
> > > > > > +			break;
> > > > > > +		}
> > > > > 
> > > > > Yet another potential user of the weight balanced tree ;)
> > > > 
> > > > Yeah, we can improve it later.
> > > > 
> > > > > > +	}
> > > > > > +
> > > > > > +	return r;
> > > > > > +}
> > > > > > +
> > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > > > > > +				void *val)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > +	if (idx < 0) {
> > > > > > +		ret = -EOPNOTSUPP;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	offset = addr & 0xf;
> > > > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > > > +		goto out;
> > > 
> > > This is not as strict as it could be. Also
> > > see below (in write) for an idea how to make the code clearer.
> > > 
> > > > > > +
> > > > > > +	mmio = &mmio_dev->mmio[idx];
> > > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > +	if (r)
> > > > > > +		goto out;
> > > 
> > > Want to give the caller some indication of an error?
> > > If not, add a comment explaining why.
> > 
> > I don't think we have a proper way to indicate the error now.
> 
> Then let's add a comment, and skip the goto - it's just confusing:
> 
> 	/* Ignore return value, we don't have a way to indicate the
> 	 * error. */
> 	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> 			entry * PCI_MSIX_ENTRY_SIZE + offset), len);

Um... I really think it's quite obvious in the code.
> 
> > Also, the
> > table_base_va already passed the check when the MMIO was registered. We can
> > add more error info later.
> > 
> > > > > > +out:
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > > > > +				int len, const void *val)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > +	gpa_t entry_base;
> > > > > > +	u32 old_ctrl, new_ctrl;
> > > 
> > > These are really __le32. Pls make them so and then
> > > fix up the sparse errors.
> > > 
> > > > > > +	u32 *ctrl_pos;
> > > 
> > > get_user should not be done on u32 *, it should be
> > > __user.  Doesn't sparse complain about this?
> > 
> > Haven't used sparse to check the patches. Will do it later.
> 
> Please do. It's a lot of code with tricky pointer math, must make sure
> we are not accessing userspace pointers directly etc.

All following patches have been checked.
> 
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > +	if (idx < 0) {
> > > > > > +		ret = -EOPNOTSUPP;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	offset = addr & 0xF;
> > > > > 
> > > > > nit, this 0xf above.
> > > > 
> > > > So you like another mask?
> > > 
> > > Yes, this should be
> > > 
> > > 	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > 	
> > > > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > > > +		goto out;
> > > 
> > > I think it's a good idea to be as strict as possible.
> > > Also let's quote the spec. Finally using % will make
> > > the logic more transparent. How about the below?
> > > 
> > > 	/* For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> > > 	   aligned full DWORD or aligned full QWORD transactions; otherwise,
> > > 	   the result is undefined. */
> > > 	if (len == 4) {
> > > 		if (addr % 4)
> > > 			goto out;
> > > 	} else if (len == 8) {
> > > 		if (addr % 8)
> > > 			goto out;
> > > 	} else
> > > 		goto out;
> > > 
> > > As read needs the same logic, it's likely a good idea
> > > to add a function for this.
> > > 
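A sketch of the shared helper being suggested here (the name is made up; the
alignment rule is the one quoted from the spec above):

static bool msix_mmio_access_ok(gpa_t addr, int len)
{
	/* aligned full DWORD or aligned full QWORD transactions only */
	return (len == 4 || len == 8) && !(addr % len);
}

Both msix_table_mmio_read() and msix_table_mmio_write() could then start with
the same "if (!msix_mmio_access_ok(addr, len)) goto out;" check.
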
> > > > > > +
> > > > > > +	mmio = &mmio_dev->mmio[idx];
> > > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > > > +
> > > > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > 
> > > It's probably a good idea to do an access_ok check for the whole
> > > entry here, and then use __get_user etc. below; these are cheaper.
> > 
> > table_base_va already passed the access_ok() check. I am not sure if we need
> > to do it again.
> 
> Yes but the process invoking the ioctl can be different from the one
> invoking the mmio.

I don't understand why it matters here...

Oh, I see I missed your original point. You mean we can check the whole entry once
and call the cheaper __get_user instead of get_user. Well, we can leave this kind
of tweak for later...
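
The deferred tweak would look something like this (a sketch, not from the
patchset):

	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
	/* one range check for the whole entry ... */
	if (!access_ok(VERIFY_WRITE, (void __user *)entry_base,
		       PCI_MSIX_ENTRY_SIZE))
		goto out;
	ctrl_pos = (u32 __user *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
	/* ... then the cheaper unchecked accessor */
	if (__get_user(old_ctrl, ctrl_pos))
		goto out;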
> 
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	/* No allow writing to other fields when entry is unmasked */
> > > 
> > > Do not allow.
> > > 
> > > > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	if (copy_to_user((void __user *)(entry_base + offset), val,
> > > > > > len))
> > > 
> > > any such cast is suspect.
> > > Is entry_base really a gpa? If it were, you could not cast it to __user.
> > 
> > HVA.
> 
> Then don't declare it gpa_t

Oh, typo... Would update it later.
> 
> > > I think it should be void __user *.
> > > 
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > > > +		goto out;
> > > 
> > > get_user assumes aligned address. But there's no guarantee ctrl_pos
> > > is aligned because table_base_va might be misaligned.
> > 
> > table_base_va already passed access_ok() check.
> 
> access_ok done on table_base_va only checks that the address
> is not a kernel one.

Now table_base_va's alignment has been checked (see the latest patchset), so I 
think we're fine here.

--
regards
Yang, Sheng

> 
> > > > > > +
> > > > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > > > +		ret = -ENOTSYNC;
> > > 
> > > Better to use != and not <; effectively the same.
> > > 
> > > Or in the above we can do:
> > > 	bool control;
> > > 
> > > 	if (len == 4) {
> > > 		if (addr % 4)
> > > 			goto out;
> > > 		control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL;
> > > 	} else if (len == 8) {
> > > 		if (addr % 8)
> > > 			goto out;
> > > 		control = offset == PCI_MSIX_ENTRY_DATA;
> > > 	} else
> > > 		goto out;
> > > 		
> > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested
> > > > > this first?  I'd also prefer to see a direct goto out here since
> > > > > it's not entirely obvious that this first 'if' always falls into
> > > > > the below 'if'. I'm not a fan of using an error code as a special
> > > > > non-error return either.
> > > > 
> > > > In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to
> > > > check new_ctrl to see if they indeed modified the ctrl bits.
> > > > 
> > > > I would try to make the logic here clearer.
> > > > 
> > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > > 
> > > > > An xor would remove some redundancy here
> > > > > 
> > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > > > >     (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > 	r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > > > > 			!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > > > 
> > > > Oh, yes, sure...
> > > 
> > > Also, why not set ret to the correct value directly?
> > > 
> > > > > > +	if (r || ret)
> > > > > > +		ret = -ENOTSYNC;
> > > 
> > > When is ret set? And why not set it to the correct value directly?
> > > 
> > > > > > +out:
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > > > +	.read     = msix_table_mmio_read,
> > > > > > +	.write    = msix_table_mmio_write,
> > > > > > +};
> > > > > > +
> > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > > > > +{
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > > > > > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > > > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > > > > +	mutex_lock(&kvm->slots_lock);
> > > > > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > > > +	mutex_unlock(&kvm->slots_lock);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > > > > +{
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	mutex_lock(&kvm->slots_lock);
> > > > > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > > > +	mutex_unlock(&kvm->slots_lock);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > > +	int r = 0, i;
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > > > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > +			mmio = &mmio_dev->mmio[i];
> > > > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > > > +				r = -EINVAL;
> > > > > > +				goto out;
> > > > > > +			}
> > > > > > +			break;
> > > > > > +		}
> > > > > > +	}
> > > > > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > > > > +		r = -EINVAL;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	/* All reserved currently */
> > > > > > +	if (mmio_user->flags) {
> > > > > > +		r = -EINVAL;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +
> > > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > > > > +		r = -EINVAL;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > +		r = -EINVAL;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > > > > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > > > > +		r = -EINVAL;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	if (!mmio) {
> > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > +			r = -ENOSPC;
> > > > > > +			goto out;
> > > > > > +		}
> > > > > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > +	}
> > > > > > +
> > > > > > +	mmio_dev->mmio_nr++;
> > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > +
> > > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > 
> > > Also check alignment for these values. base_va should be
> > > at least 4-byte aligned. Maybe more; need to think about it ...
> > > Look in the spec for table_base_addr alignment.
> > 
> > OK.
> > 
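One possible registration-time check (a sketch; the MSI-X spec keeps the table
base QWORD-aligned, so 8-byte alignment is assumed here):

	/* reject misaligned table bases at registration time */
	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}
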
> > --
> > regards
> > Yang, Sheng
> > 
> > > > > > +	}
> > > > > > +out:
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	return r;
> > > > > > +}
> > > > > > +
> > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > > > +	int r = 0, i, j;
> > > > > 
> > > > > set r = -EINVAL here
> > > > > 
> > > > > > +	bool found = 0;
> > > > > > +
> > > > > > +	if (!mmio)
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > > > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > > > > +			found = true;
> > > > > 
> > > > > set r = 0 here
> > > > > 
> > > > > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > > > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > > > > +			mmio_dev->mmio_nr--;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +	}
> > > > > > +	if (!found)
> > > > > > +		r = -EINVAL;
> > > > > 
> > > > > then get rid of found
> > > > 
> > > > Sure.
> > > > 
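A sketch of the simplified removal loop Alex is describing (same logic as the
quoted code, with found dropped):

	int r = -EINVAL, i, j;

	mutex_lock(&mmio_dev->lock);
	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
		    mmio_dev->mmio[i].type == mmio->type) {
			r = 0;	/* found the entry to remove */
			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
			mmio_dev->mmio_nr--;
			break;
		}
	}
	mutex_unlock(&mmio_dev->lock);
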
> > > > --
> > > > regards
> > > > Yang, Sheng
> > > > 
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	return r;
> > > > > > +}
> > > > > > +
> > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio mmio;
> > > > > > +
> > > > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > > > +	mmio.type = mmio_user->type;
> > > > > > +
> > > > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > > > +}
> > > > > > +
> > > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > > > > new file mode 100644
> > > > > > index 0000000..01b6587
> > > > > > --- /dev/null
> > > > > > +++ b/virt/kvm/msix_mmio.h
> > > > > > @@ -0,0 +1,25 @@
> > > > > > +#ifndef __KVM_MSIX_MMIO_H__
> > > > > > +#define __KVM_MSIX_MMIO_H__
> > > > > > +/*
> > > > > > + * MSI-X MMIO emulation
> > > > > > + *
> > > > > > + * Copyright (c) 2010 Intel Corporation
> > > > > > + *
> > > > > > + * This work is licensed under the terms of the GNU GPL, version 2. See
> > > > > > + * the COPYING file in the top-level directory.
> > > > > > + *
> > > > > > + * Author:
> > > > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > > > + */
> > > > > > +
> > > > > > +#include <linux/pci.h>
> > > > > > +
> > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> > > > > > +
> > > > > > +#endif

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-24  9:44         ` Sheng Yang
@ 2011-02-24 10:17           ` Michael S. Tsirkin
  2011-02-25  6:12             ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2011-02-24 10:17 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote:
> On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > > Then we can support mask bit operation of assigned devices now.
> > > > 
> > > > Looks pretty good overall.  A few comments below.  It seems like we
> > > > should be able to hook this into vfio with a small stub in kvm.  We
> > > > just need to be able to communicate disabling and enabling of
> > > > individual msix vectors.  For brute force, we could do this with a lot
> > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
> > > > vector.  Not sure if there's something smaller that could do it. 
> > > > Thanks,
> > > 
> > > Alex, thanks for your comments. See my comments below:
> > > > Alex
> > > > 
> > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > > ---
> > > > > 
> > > > >  arch/x86/kvm/Makefile    |    2 +-
> > > > >  arch/x86/kvm/x86.c       |    8 +-
> > > > >  include/linux/kvm.h      |   21 ++++
> > > > >  include/linux/kvm_host.h |   25 ++++
> > > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > > >  virt/kvm/msix_mmio.c     |  286
> > > > >  ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h
> > > > >  
> > > > >  |   25 ++++
> > > > >  
> > > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > > 
> > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > > index f15501f..3a0d851 100644
> > > > > --- a/arch/x86/kvm/Makefile
> > > > > +++ b/arch/x86/kvm/Makefile
> > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > > 
> > > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > > > >  
> > > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > > 
> > > > > -				assigned-dev.o)
> > > > > +				assigned-dev.o msix_mmio.o)
> > > > > 
> > > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > > > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > > > >  async_pf.o)
> > > > > 
> > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > index fa708c9..89bf12c 100644
> > > > > --- a/arch/x86/kvm/x86.c
> > > > > +++ b/arch/x86/kvm/x86.c
> > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > 
> > > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > > >  	case KVM_CAP_XSAVE:
> > > > > 
> > > > >  	case KVM_CAP_ASYNC_PF:
> > > > > +	case KVM_CAP_MSIX_MMIO:
> > > > >  		r = 1;
> > > > >  		break;
> > > > >  	
> > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > @@ -3807,6 +3808,7 @@ static int
> > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > 
> > > > >  					   struct kvm_vcpu *vcpu)
> > > > >  
> > > > >  {
> > > > >  
> > > > >  	gpa_t                 gpa;
> > > > > 
> > > > > +	int r;
> > > > > 
> > > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > > 
> > > > > @@ -3822,14 +3824,16 @@ static int
> > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > 
> > > > >  mmio:
> > > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > 
> > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > 
> > > > >  	/*
> > > > >  	
> > > > >  	 * Is this MMIO handled locally?
> > > > >  	 */
> > > > > 
> > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > +	if (!r)
> > > > > 
> > > > >  		return X86EMUL_CONTINUE;
> > > > >  	
> > > > >  	vcpu->mmio_needed = 1;
> > > > > 
> > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > 
> > This use of -ENOTSYNC is IMO confusing.
> > How about we make vcpu_mmio_write return the positive exit reason?
> > Negative value will mean an error.
> 
> In fact currently a negative value means something more needs to be done, the same as 
> an MMIO exit.


So it would be
	if (!r)
		return X86EMUL_CONTINUE;

	vcpu->run->exit_reason = r;

> Now I think we can keep it, or update them all later.

The way to do this would be
1. patch to return KVM_EXIT_MMIO on mmio
2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top
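
I.e. after step 1 the bus write path would look more or less like this
(sketch only, untested):

	/* handlers return 0 when handled in kernel, a positive KVM_EXIT_*
	 * reason when userspace must complete the access, and a negative
	 * errno meaning "not mine, try the next device" */
	int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
			     int len, const void *val)
	{
		struct kvm_io_bus *bus;
		int i, r;

		bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
		for (i = 0; i < bus->dev_count; i++) {
			r = kvm_iodevice_write(bus->devs[i], addr, len, val);
			if (r >= 0)
				return r;
		}
		return KVM_EXIT_MMIO;	/* nobody claimed it: plain MMIO exit */
	}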

> 
> --
> regards
> Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-24  8:08         ` Sheng Yang
@ 2011-02-24 10:11           ` Michael S. Tsirkin
  2011-02-25  5:54             ` Sheng Yang
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2011-02-24 10:11 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote:
> On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > > Then we can support mask bit operation of assigned devices now.
> > > > 
> > > > Looks pretty good overall.  A few comments below.  It seems like we
> > > > should be able to hook this into vfio with a small stub in kvm.  We
> > > > just need to be able to communicate disabling and enabling of
> > > > individual msix vectors.  For brute force, we could do this with a lot
> > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
> > > > vector.  Not sure if there's something smaller that could do it. 
> > > > Thanks,
> > > 
> > > Alex, thanks for your comments. See my comments below:
> > > > Alex
> > > > 
> > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > > ---
> > > > > 
> > > > >  arch/x86/kvm/Makefile    |    2 +-
> > > > >  arch/x86/kvm/x86.c       |    8 +-
> > > > >  include/linux/kvm.h      |   21 ++++
> > > > >  include/linux/kvm_host.h |   25 ++++
> > > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > > >  virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  virt/kvm/msix_mmio.h     |   25 ++++
> > > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > > 
> > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > > index f15501f..3a0d851 100644
> > > > > --- a/arch/x86/kvm/Makefile
> > > > > +++ b/arch/x86/kvm/Makefile
> > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > > 
> > > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > > > >  
> > > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > > 
> > > > > -				assigned-dev.o)
> > > > > +				assigned-dev.o msix_mmio.o)
> > > > > 
> > > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > > > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > > > >  async_pf.o)
> > > > > 
> > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > index fa708c9..89bf12c 100644
> > > > > --- a/arch/x86/kvm/x86.c
> > > > > +++ b/arch/x86/kvm/x86.c
> > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > 
> > > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > > >  	case KVM_CAP_XSAVE:
> > > > > 
> > > > >  	case KVM_CAP_ASYNC_PF:
> > > > > +	case KVM_CAP_MSIX_MMIO:
> > > > >  		r = 1;
> > > > >  		break;
> > > > >  	
> > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > @@ -3807,6 +3808,7 @@ static int
> > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > 
> > > > >  					   struct kvm_vcpu *vcpu)
> > > > >  
> > > > >  {
> > > > >  
> > > > >  	gpa_t                 gpa;
> > > > > 
> > > > > +	int r;
> > > > > 
> > > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > > 
> > > > > @@ -3822,14 +3824,16 @@ static int
> > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > 
> > > > >  mmio:
> > > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > 
> > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > 
> > > > >  	/*
> > > > >  	
> > > > >  	 * Is this MMIO handled locally?
> > > > >  	 */
> > > > > 
> > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > +	if (!r)
> > > > > 
> > > > >  		return X86EMUL_CONTINUE;
> > > > >  	
> > > > >  	vcpu->mmio_needed = 1;
> > > > > 
> > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > 
> > This use of -ENOTSYNC is IMO confusing.
> > How about we make vcpu_mmio_write return the positive exit reason?
> > Negative value will mean an error.
> 
> Makes sense. I will update it.
> > 
> > > > >  	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > > >  	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > > >  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > > 
> > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > index ea2dc1a..ad9df4b 100644
> > > > > --- a/include/linux/kvm.h
> > > > > +++ b/include/linux/kvm.h
> > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > > > 
> > > > >  #define KVM_EXIT_NMI              16
> > > > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > > > >  #define KVM_EXIT_OSI              18
> > > > > 
> > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > > > 
> > > > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > > > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > > > 
> > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
> > > > > 
> > > > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > > > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > > > >  #define KVM_CAP_ASYNC_PF 59
> > > > > 
> > > > > +#define KVM_CAP_MSIX_MMIO 60
> > > > > 
> > > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > > 
> > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data {
> > > > > 
> > > > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
> > > > >  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
> > > > >  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > > > 
> > > > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > > > > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> > > > > 
> > > > >  /* Available with KVM_CAP_PIT_STATE2 */
> > > > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
> > > > >  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> > > > > 
> > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
> > > > > 
> > > > >  	__u16 padding[3];
> > > > >  
> > > > >  };
> > > > > 
> > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > > > +
> > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > > > +
> > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > +struct kvm_msix_mmio_user {
> > > > > +	__u32 dev_id;
> > > > > +	__u16 type;
> > > > > +	__u16 max_entries_nr;
> > > > > +	__u64 base_addr;
> > > > > +	__u64 base_va;
> > > > > +	__u64 flags;
> > > > > +	__u64 reserved[4];
> > > > > +};
> > > > > +
> > > > > 
> > > > >  #endif /* __LINUX_KVM_H */
> > > > > 
> > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > index 7d313e0..c10670c 100644
> > > > > --- a/include/linux/kvm_host.h
> > > > > +++ b/include/linux/kvm_host.h
> > > > > @@ -233,6 +233,27 @@ struct kvm_memslots {
> > > > > 
> > > > >  					KVM_PRIVATE_MEM_SLOTS];
> > > > >  
> > > > >  };
> > > > > 
> > > > > +#define KVM_MSIX_MMIO_MAX    32
> > > > 
> > > > I assume this is based on 32 devices/bus, but we could exceed this if
> > > > we add more buses or make use of more functions.  How hard is it to
> > > > make this dynamic?
> > > 
> > > We can improve this later (maybe with a balanced tree?). 32 should be more
> > > than sufficient for now. And this patchset has been on the mailing list
> > > for months; I really want to get it in first...
> > > 
> > > > > +
> > > > > +struct kvm_msix_mmio {
> > > > > +	u32 dev_id;
> > > > > +	u16 type;
> > > > > +	u16 max_entries_nr;
> > > > > +	u64 flags;
> > > > > +	gpa_t table_base_addr;
> > > > > +	hva_t table_base_va;
> > > > > +	gpa_t pba_base_addr;
> > > > > +	hva_t pba_base_va;
> > > > > +};
> > > > > +
> > > > > +struct kvm_msix_mmio_dev {
> > > > > +	struct kvm *kvm;
> > > > > +	struct kvm_io_device table_dev;
> > > > > +	int mmio_nr;
> > > > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > > > +	struct mutex lock;
> > > > > +};
> > > > > +
> > > > > 
> > > > >  struct kvm {
> > > > >  
> > > > >  	spinlock_t mmu_lock;
> > > > >  	raw_spinlock_t requests_lock;
> > > > > 
> > > > > @@ -281,6 +302,7 @@ struct kvm {
> > > > > 
> > > > >  	long mmu_notifier_count;
> > > > >  
> > > > >  #endif
> > > > >  
> > > > >  	long tlbs_dirty;
> > > > > 
> > > > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > > > 
> > > > >  };
> > > > >  
> > > > >  /* The guest did something we don't support. */
> > > > > 
> > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm
> > > > > *kvm,
> > > > > 
> > > > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > > > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > > > 
> > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > +			int assigned_dev_id, int entry, bool mask);
> > > > > +
> > > > > 
> > > > >  /* For vcpu->arch.iommu_flags */
> > > > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > > > 
> > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > > > index ae72ae6..d1598a6 100644
> > > > > --- a/virt/kvm/assigned-dev.c
> > > > > +++ b/virt/kvm/assigned-dev.c
> > > > > @@ -18,6 +18,7 @@
> > > > > 
> > > > >  #include <linux/interrupt.h>
> > > > >  #include <linux/slab.h>
> > > > >  #include "irq.h"
> > > > > 
> > > > > +#include "msix_mmio.h"
> > > > > 
> > > > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > > > >  list_head *head,
> > > > >  
> > > > >  						      int assigned_dev_id)
> > > > > 
> > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm
> > > > > *kvm,
> > > > > 
> > > > >  	kvm_deassign_irq(kvm, assigned_dev,
> > > > >  	assigned_dev->irq_requested_type);
> > > > >  
> > > > >  }
> > > > > 
> > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > > > +				struct kvm_assigned_dev_kernel *adev)
> > > > > +{
> > > > > +	struct kvm_msix_mmio mmio;
> > > > > +
> > > > > +	mmio.dev_id = adev->assigned_dev_id;
> > > > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > > > +}
> > > > > +
> > > > > 
> > > > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > > > >  
> > > > >  				     struct kvm_assigned_dev_kernel
> > > > >  				     *assigned_dev)
> > > > >  
> > > > >  {
> > > > >  
> > > > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > > > 
> > > > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > > > +
> > > > > 
> > > > >  	__pci_reset_function(assigned_dev->dev);
> > > > >  	pci_restore_state(assigned_dev->dev);
> > > > > 
> > > > > @@ -785,3 +799,33 @@ out:
> > > > >  	return r;
> > > > >  
> > > > >  }
> > > > > 
> > > > > +/* The caller should hold kvm->lock */
> > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > +				int assigned_dev_id, int entry, bool mask)
> > > > > +{
> > > > > +	int r = -EFAULT;
> > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > +	int i;
> > > > > +
> > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > +		return r;
> > > > > +
> > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > +				      assigned_dev_id);
> > > > > +	if (!adev)
> > > > > +		goto out;
> > > > > +
> > > > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > > > +	for (i = 0; i < adev->entries_nr; i++)
> > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > +			if (mask)
> > > > > +				disable_irq_nosync(
> > > > > +					adev->host_msix_entries[i].vector);
> > > > > +			else
> > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > +			r = 0;
> > > > > +			break;
> > > > > +		}
> > > > > +out:
> > > > > +	return r;
> > > > > +}
> > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > > index b1b6cbb..b7807c8 100644
> > > > > --- a/virt/kvm/kvm_main.c
> > > > > +++ b/virt/kvm/kvm_main.c
> > > > > @@ -56,6 +56,7 @@
> > > > > 
> > > > >  #include "coalesced_mmio.h"
> > > > >  #include "async_pf.h"
> > > > > 
> > > > > +#include "msix_mmio.h"
> > > > > 
> > > > >  #define CREATE_TRACE_POINTS
> > > > >  #include <trace/events/kvm.h>
> > > > > 
> > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > > > 
> > > > >  	struct mm_struct *mm = kvm->mm;
> > > > >  	
> > > > >  	kvm_arch_sync_events(kvm);
> > > > > 
> > > > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > > > 
> > > > >  	spin_lock(&kvm_lock);
> > > > >  	list_del(&kvm->vm_list);
> > > > >  	spin_unlock(&kvm_lock);
> > > > > 
> > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > > > 
> > > > >  		mutex_unlock(&kvm->lock);
> > > > >  		break;
> > > > >  
> > > > >  #endif
> > > > > 
> > > > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > > +
> > > > > +		r = -EFAULT;
> > > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > > +			goto out;
> > > > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > > > +		break;
> > > > > +	}
> > > > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > > +
> > > > > +		r = -EFAULT;
> > > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > > +			goto out;
> > > > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > > > +		break;
> > > > > +	}
> > > > > 
> > > > >  	default:
> > > > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > > > >  		if (r == -ENOTTY)
> > > > > 
> > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > > 
> > > > >  		return r;
> > > > >  	
> > > > >  	}
> > > > >  
> > > > >  #endif
> > > > > 
> > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > +	if (r < 0) {
> > > > > +		kvm_put_kvm(kvm);
> > > > > +		return r;
> > > > > +	}
> > > > > +
> > > > > 
> > > > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > > > >  	if (r < 0)
> > > > >  	
> > > > >  		kvm_put_kvm(kvm);
> > > > > 
> > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct
> > > > > kvm_io_bus *bus)
> > > > > 
> > > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t
> > > > >  addr,
> > > > >  
> > > > >  		     int len, const void *val)
> > > > >  
> > > > >  {
> > > > > 
> > > > > -	int i;
> > > > > +	int i, r = -EOPNOTSUPP;
> > > > > 
> > > > >  	struct kvm_io_bus *bus;
> > > > >  	
> > > > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > > > 
> > > > > -	for (i = 0; i < bus->dev_count; i++)
> > > > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > > > > +	for (i = 0; i < bus->dev_count; i++) {
> > > > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > > > > +		if (r == -ENOTSYNC)
> > > > > +			break;
> > > > > +		else if (!r)
> > > > > 
> > > > >  			return 0;
> > 
> > So the above will be
> >  if (r >= 0)
> > 	 return r;
> > 
> > > > > -	return -EOPNOTSUPP;
> > > > > +	}
> > > > > +	return r;
> > > > > 
> > > > >  }
> > > > >  
> > > > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > > > 
> > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > > > new file mode 100644
> > > > > index 0000000..dde9b62
> > > > > --- /dev/null
> > > > > +++ b/virt/kvm/msix_mmio.c
> > > > > @@ -0,0 +1,286 @@
> > > > > +/*
> > > > > + * MSI-X MMIO emulation
> > > > > + *
> > > > > + * Copyright (c) 2010 Intel Corporation
> > > > > + *
> > > > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > > > + * the COPYING file in the top-level directory.
> > > > > + *
> > > > > + * Author:
> > > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > > + */
> > > > > +
> > > > > +#include <linux/kvm_host.h>
> > > > > +#include <linux/kvm.h>
> > > > > +
> > > > > +#include "msix_mmio.h"
> > > > > +#include "iodev.h"
> > > > > +
> > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct
> > > > > kvm_msix_mmio *mmio, +				int entry, u32 flag)
> > > > > +{
> > > > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > > > +				mmio->dev_id, entry, flag);
> > > > > +	return -EFAULT;
> > > > > +}
> > > > > +
> > > > > +/* Caller must hold dev->lock */
> > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > > > +				gpa_t addr, int len)
> > > > > +{
> > > > > +	gpa_t start, end;
> > > > > +	int i, r = -EINVAL;
> > > > > +
> > > > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > > > +		start = dev->mmio[i].table_base_addr;
> > > > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > > > +			dev->mmio[i].max_entries_nr;
> > > > > +		if (addr >= start && addr + len <= end) {
> > > > > +			r = i;
> > > > > +			break;
> > > > > +		}
> > > > 
> > > > Yet another potential user of the weight balanced tree ;)
> > > 
> > > Yeah, we can improve it later.
> > > 
> > > > > +	}
> > > > > +
> > > > > +	return r;
> > > > > +}
> > > > > +
> > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t
> > > > > addr, int len, +				void *val)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > +	struct kvm_msix_mmio *mmio;
> > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > +	if (idx < 0) {
> > > > > +		ret = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > > +		goto out;
> > > > > +
> > > > > +	offset = addr & 0xf;
> > > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > > +		goto out;
> > 
> > This is not as strict as it could be. Also
> > see below (in write) for an idea how to make the code clearer.
> > 
> > > > > +
> > > > > +	mmio = &mmio_dev->mmio[idx];
> > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > +	if (r)
> > > > > +		goto out;
> > 
> > Want to give the caller some indication of an error?
> > If not add a comment why.
> 
> I don't think we have a proper way to indicate the error now.

Then let's add a comment, and skip the goto - it's just confusing:

	/* Ignore return value, we don't have a way to indicate the
	 * error. */
	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
	


> Also the 
> table_base_va already passed the check when the MMIO was registered. We can add more 
> error info later.
> > 
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t
> > > > > addr, +				int len, const void *val)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > +	struct kvm_msix_mmio *mmio;
> > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > +	gpa_t entry_base;
> > > > > +	u32 old_ctrl, new_ctrl;
> > 
> > These are really __le32. Pls make them so and then
> > fix up the sparse errors.
> > 
> > > > > +	u32 *ctrl_pos;
> > 
> > get_user should not be done on u32 *, it should be
> > __user.  Does not sparse complain about this?
> > 
> 
> Haven't used sparse to check the patches. Will do it later.

Please do. It's a lot of code with tricky pointer math; we must make sure
we are not accessing userspace pointers directly, etc.
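
For reference, the sparse-clean declarations would be something like this
(a sketch, assuming entry_base is made a plain unsigned long as discussed
below):

	/* the entry lives in user memory, and the PCI fields are
	 * little-endian on every host, so both annotations are needed */
	__le32 __user *ctrl_pos;
	__le32 old_ctrl;
	u32 ctrl;

	ctrl_pos = (__le32 __user *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
	if (get_user(old_ctrl, ctrl_pos))
		goto out;
	ctrl = le32_to_cpu(old_ctrl);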

> > > > > +
> > > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > +	if (idx < 0) {
> > > > > +		ret = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > > +		goto out;
> > > > > +
> > > > > +	offset = addr & 0xF;
> > > > 
> > > > nit, this 0xf above.
> > > 
> > > So you like another mask?
> > 
> > Yes, this should be
> > 
> > 	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > 
> > > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > > +		goto out;
> > 
> > I think it's a good idea to be as strict as possible.
> > Also let's quote the spec. Finally using % will make
> > the logic more transparent. How about the below?
> > 
> > 	/* For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> > 	   aligned full DWORD or aligned full QWORD transactions; otherwise, the
> > 	   result is undefined. */
> > 	if (len == 4) {
> > 		if (addr % 4)
> > 			goto out;
> >         } else if (len == 8) {
> > 		if (addr % 8)
> > 			goto out;
> >         } else
> > 		goto out;
> > 
> > As read needs same logic it's likely a good idea
> > to add a function for this.
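
E.g. (sketch):

	/* shared by the table read and write handlers: the spec only
	 * allows aligned full DWORD / aligned full QWORD accesses, so
	 * refuse everything else */
	static bool msix_table_access_ok(gpa_t addr, int len)
	{
		if (len != 4 && len != 8)
			return false;
		/* len is a power of two, so this is addr % len */
		return !(addr & (len - 1));
	}

and both handlers then start with "if (!msix_table_access_ok(addr, len))
goto out;".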
> > 
> > > > > +
> > > > > +	mmio = &mmio_dev->mmio[idx];
> > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > > +
> > > > > +	if (get_user(old_ctrl, ctrl_pos))
> > 
> > It's probably a good idea to do an access_ok() check for the whole
> > entry here, and then use __get_user() etc. below; these are cheaper.
> 
> table_base_va already passed the access_ok() check. I am not sure if we need to do it 
> again.

Yes but the process invoking the ioctl can be different from the one
invoking the mmio.
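
Something like this at the top of the handler would cover it (a sketch):

	/* revalidate at access time: the mm doing the mmio need not be
	 * the mm that registered the table */
	if (!access_ok(VERIFY_WRITE, (void __user *)entry_base,
		       PCI_MSIX_ENTRY_SIZE))
		goto out;
	/* from here on, __get_user()/__put_user() on this entry are fine */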

> > 
> > > > > +		goto out;
> > > > > +
> > > > > +	/* No allow writing to other fields when entry is unmasked */
> > 
> > Do not allow.
> > 
> > > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > +		goto out;
> > > > > +
> > > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > 
> > any such cast is suspect.
> > Is entry_base really a gpa? If it were, you could not cast it to __user.
> 
> HVA.

Then don't declare it gpa_t

> > I think it should be void __user *.
> > 
> > > > > +		goto out;
> > > > > +
> > > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > > +		goto out;
> > 
> > get_user assumes an aligned address. But there's no guarantee ctrl_pos
> > is aligned because table_base_va might be misaligned.
> 
> table_base_va already passed the access_ok() check.

access_ok done on table_base_va only checks that the address
is not a kernel one.

> > 
> > > > > +
> > > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > > +		ret = -ENOTSYNC;
> > 
> > Better to use != and not <, effectively same.
> > Or in the above we can do:
> > 	bool control;
> > 	if (len == 4) {
> > 		if (addr % 4)
> > 			goto out;
> > 		control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL;
> >         } else if (len == 8) {
> > 		if (addr % 8)
> > 			goto out;
> > 		control = offset == PCI_MSIX_ENTRY_DATA;
> >         } else
> > 		goto out;
> > 
> > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> > > > first?  I'd also prefer to see a direct goto out here since it's not
> > > > entirely obvious that this first 'if' always falls into the below 'if'.
> > > > I'm not a fan of using an error code as a special non-error return
> > > > either.
> > > 
> > > In fact when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check
> > > new_ctrl to see if they indeed modified ctrl bits.
> > > 
> > > I will try to make the logic here clearer.
> > > 
> > > > > +	if (old_ctrl == new_ctrl)
> > > > > +		goto out;
> > > > > +
> > > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > 
> > > > An xor would remove some redundancy here
> > > > 
> > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > > >     (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > 	r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > > > 			!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > > 
> > > Oh, yes, sure...
> > 
> > Also why not set ret to the correct value directly?
> > 
> > > > > +	if (r || ret)
> > > > > +		ret = -ENOTSYNC;
> > 
> > when is ret set? And why not to the correct value directly?
> > 
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > > +	.read     = msix_table_mmio_read,
> > > > > +	.write    = msix_table_mmio_write,
> > > > > +};
> > > > > +
> > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > > > +{
> > > > > +	int ret;
> > > > > +
> > > > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev,
> > > > > &msix_mmio_table_ops); +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > > > +	mutex_lock(&kvm->slots_lock);
> > > > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > > +	mutex_unlock(&kvm->slots_lock);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > > > +{
> > > > > +	int ret;
> > > > > +
> > > > > +	mutex_lock(&kvm->slots_lock);
> > > > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > > +	mutex_unlock(&kvm->slots_lock);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > +	int r = 0, i;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > +			mmio = &mmio_dev->mmio[i];
> > > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > > +				r = -EINVAL;
> > > > > +				goto out;
> > > > > +			}
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > > > +		r = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	/* All reserved currently */
> > > > > +	if (mmio_user->flags) {
> > > > > +		r = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > > > +		r = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > +		r = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > > > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > > > +		r = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if (!mmio) {
> > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > +			r = -ENOSPC;
> > > > > +			goto out;
> > > > > +		}
> > > > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > +	}
> > > > > +
> > > > > +	mmio_dev->mmio_nr++;
> > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > +	mmio->flags = mmio_user->flags;
> > > > > +
> > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > +		mmio->table_base_va = mmio_user->base_va;
> > 
> > Also check alignment for these values. base_va should be
> > at least 4 byte aligned. Maybe more, need to think about it ...
> > Look in spec for table_base_addr alignment.
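
Something along these lines in the register ioctl, I think (a sketch: the
spec keeps the table offset QWORD aligned, and requiring the same of
base_va keeps the get_user on the ctrl word naturally aligned):

	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}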
> 
> OK.
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > > > +	}
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return r;
> > > > > +}
> > > > > +
> > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > > +	int r = 0, i, j;
> > > > 
> > > > set r = -EINVAL here
> > > > 
> > > > > +	bool found = 0;
> > > > > +
> > > > > +	if (!mmio)
> > > > > +		return 0;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > > > +			found = true;
> > > > 
> > > > set r = 0 here
> > > > 
> > > > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > > > +			mmio_dev->mmio_nr--;
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +	if (!found)
> > > > > +		r = -EINVAL;
> > > > 
> > > > then get rid of found
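
I.e. with r initialized to -EINVAL the loop becomes (sketch):

	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id != mmio->dev_id ||
		    mmio_dev->mmio[i].type != mmio->type)
			continue;
		r = 0;
		for (j = i; j < mmio_dev->mmio_nr - 1; j++)
			mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
		mmio_dev->mmio_nr--;
		/* note: the now-stale slot is at index mmio_nr after the
		 * decrement; the posted patch clears one slot past it */
		memset(&mmio_dev->mmio[mmio_dev->mmio_nr], 0,
		       sizeof(mmio_dev->mmio[0]));
		break;
	}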
> > > 
> > > Sure.
> > > 
> > > --
> > > regards
> > > Yang, Sheng
> > > 
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return r;
> > > > > +}
> > > > > +
> > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > > +{
> > > > > +	struct kvm_msix_mmio mmio;
> > > > > +
> > > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > > +	mmio.type = mmio_user->type;
> > > > > +
> > > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > > +}
> > > > > +
> > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > > > new file mode 100644
> > > > > index 0000000..01b6587
> > > > > --- /dev/null
> > > > > +++ b/virt/kvm/msix_mmio.h
> > > > > @@ -0,0 +1,25 @@
> > > > > +#ifndef __KVM_MSIX_MMIO_H__
> > > > > +#define __KVM_MSIX_MMIO_H__
> > > > > +/*
> > > > > + * MSI-X MMIO emulation
> > > > > + *
> > > > > + * Copyright (c) 2010 Intel Corporation
> > > > > + *
> > > > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > > > + * the COPYING file in the top-level directory.
> > > > > + *
> > > > > + * Author:
> > > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > > + */
> > > > > +
> > > > > +#include <linux/pci.h>
> > > > > +
> > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > > > > *mmio_user); +
> > > > > +#endif

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23  8:45       ` Michael S. Tsirkin
  2011-02-24  8:08         ` Sheng Yang
@ 2011-02-24  9:44         ` Sheng Yang
  2011-02-24 10:17           ` Michael S. Tsirkin
  1 sibling, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-02-24  9:44 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > Then we can support mask bit operation of assigned devices now.
> > > 
> > > Looks pretty good overall.  A few comments below.  It seems like we
> > > should be able to hook this into vfio with a small stub in kvm.  We
> > > just need to be able to communicate disabling and enabling of
> > > individual msix vectors.  For brute force, we could do this with a lot
> > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
> > > vector.  Not sure if there's something smaller that could do it. 
> > > Thanks,
> > 
> > Alex, thanks for your comments. See my comments below:
> > > Alex
> > > 
> > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > ---
> > > > 
> > > >  arch/x86/kvm/Makefile    |    2 +-
> > > >  arch/x86/kvm/x86.c       |    8 +-
> > > >  include/linux/kvm.h      |   21 ++++
> > > >  include/linux/kvm_host.h |   25 ++++
> > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > >  virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  virt/kvm/msix_mmio.h     |   25 ++++
> > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > 
> > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > index f15501f..3a0d851 100644
> > > > --- a/arch/x86/kvm/Makefile
> > > > +++ b/arch/x86/kvm/Makefile
> > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > 
> > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > > >  
> > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > 
> > > > -				assigned-dev.o)
> > > > +				assigned-dev.o msix_mmio.o)
> > > > 
> > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > > >  async_pf.o)
> > > > 
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index fa708c9..89bf12c 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > 
> > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > >  	case KVM_CAP_XSAVE:
> > > > 
> > > >  	case KVM_CAP_ASYNC_PF:
> > > > +	case KVM_CAP_MSIX_MMIO:
> > > >  		r = 1;
> > > >  		break;
> > > >  	
> > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > @@ -3807,6 +3808,7 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  					   struct kvm_vcpu *vcpu)
> > > >  
> > > >  {
> > > >  
> > > >  	gpa_t                 gpa;
> > > > 
> > > > +	int r;
> > > > 
> > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > 
> > > > @@ -3822,14 +3824,16 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  mmio:
> > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > 
> > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > 
> > > >  	/*
> > > >  	
> > > >  	 * Is this MMIO handled locally?
> > > >  	 */
> > > > 
> > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > +	if (!r)
> > > > 
> > > >  		return X86EMUL_CONTINUE;
> > > >  	
> > > >  	vcpu->mmio_needed = 1;
> > > > 
> > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> 
> This use of -ENOTSYNC is IMO confusing.
> How about we make vcpu_mmio_write return the positive exit reason?
> Negative value will mean an error.

In fact currently a negative value means something more needs to be done, the same as 
an MMIO exit. Now I think we can keep it, or update them all later.

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23  8:45       ` Michael S. Tsirkin
@ 2011-02-24  8:08         ` Sheng Yang
  2011-02-24 10:11           ` Michael S. Tsirkin
  2011-02-24  9:44         ` Sheng Yang
  1 sibling, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-02-24  8:08 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
> On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > Then we can support mask bit operation of assigned devices now.
> > > 
> > > Looks pretty good overall.  A few comments below.  It seems like we
> > > should be able to hook this into vfio with a small stub in kvm.  We
> > > just need to be able to communicate disabling and enabling of
> > > individual msix vectors.  For brute force, we could do this with a lot
> > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
> > > vector.  Not sure if there's something smaller that could do it. 
> > > Thanks,
> > 
> > Alex, thanks for your comments. See my comments below:
> > > Alex
> > > 
> > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > ---
> > > > 
> > > >  arch/x86/kvm/Makefile    |    2 +-
> > > >  arch/x86/kvm/x86.c       |    8 +-
> > > >  include/linux/kvm.h      |   21 ++++
> > > >  include/linux/kvm_host.h |   25 ++++
> > > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > > >  virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  virt/kvm/msix_mmio.h     |   25 ++++
> > > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > 
> > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > index f15501f..3a0d851 100644
> > > > --- a/arch/x86/kvm/Makefile
> > > > +++ b/arch/x86/kvm/Makefile
> > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > 
> > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > > >  
> > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > 
> > > > -				assigned-dev.o)
> > > > +				assigned-dev.o msix_mmio.o)
> > > > 
> > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > > >  async_pf.o)
> > > > 
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index fa708c9..89bf12c 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > 
> > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > >  	case KVM_CAP_XSAVE:
> > > > 
> > > >  	case KVM_CAP_ASYNC_PF:
> > > > +	case KVM_CAP_MSIX_MMIO:
> > > >  		r = 1;
> > > >  		break;
> > > >  	
> > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > @@ -3807,6 +3808,7 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  					   struct kvm_vcpu *vcpu)
> > > >  
> > > >  {
> > > >  
> > > >  	gpa_t                 gpa;
> > > > 
> > > > +	int r;
> > > > 
> > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > 
> > > > @@ -3822,14 +3824,16 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  mmio:
> > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > 
> > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > 
> > > >  	/*
> > > >  	
> > > >  	 * Is this MMIO handled locally?
> > > >  	 */
> > > > 
> > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > +	if (!r)
> > > > 
> > > >  		return X86EMUL_CONTINUE;
> > > >  	
> > > >  	vcpu->mmio_needed = 1;
> > > > 
> > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> 
> This use of -ENOTSYNC is IMO confusing.
> How about we make vcpu_mmio_write return the positive exit reason?
> Negative value will mean an error.

Makes sense. I will update it.
> 
> > > >  	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > >  	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > >  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > 
> > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > index ea2dc1a..ad9df4b 100644
> > > > --- a/include/linux/kvm.h
> > > > +++ b/include/linux/kvm.h
> > > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > > 
> > > >  #define KVM_EXIT_NMI              16
> > > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > > >  #define KVM_EXIT_OSI              18
> > > > 
> > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > > 
> > > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > > 
> > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
> > > > 
> > > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > > >  #define KVM_CAP_ASYNC_PF 59
> > > > 
> > > > +#define KVM_CAP_MSIX_MMIO 60
> > > > 
> > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > 
> > > > @@ -672,6 +674,9 @@ struct kvm_clock_data {
> > > > 
> > > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
> > > >  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
> > > >  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > > 
> > > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > > > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> > > > 
> > > >  /* Available with KVM_CAP_PIT_STATE2 */
> > > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
> > > >  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> > > > 
> > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
> > > > 
> > > >  	__u16 padding[3];
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > +struct kvm_msix_mmio_user {
> > > > +	__u32 dev_id;
> > > > +	__u16 type;
> > > > +	__u16 max_entries_nr;
> > > > +	__u64 base_addr;
> > > > +	__u64 base_va;
> > > > +	__u64 flags;
> > > > +	__u64 reserved[4];
> > > > +};
> > > > +
> > > > 
> > > >  #endif /* __LINUX_KVM_H */
> > > > 
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index 7d313e0..c10670c 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -233,6 +233,27 @@ struct kvm_memslots {
> > > > 
> > > >  					KVM_PRIVATE_MEM_SLOTS];
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_MSIX_MMIO_MAX    32
> > > 
> > > I assume this is based on 32 devices/bus, but we could exceed this if
> > > we add more buses or make use of more functions.  How hard is it to
> > > make this dynamic?
> > 
> > We can improve this later (maybe with a balanced tree?). 32 should be more
> > than sufficient for now. And this patchset has been on the mailing list
> > for months; I really want to get it in first...
> > 
> > > > +
> > > > +struct kvm_msix_mmio {
> > > > +	u32 dev_id;
> > > > +	u16 type;
> > > > +	u16 max_entries_nr;
> > > > +	u64 flags;
> > > > +	gpa_t table_base_addr;
> > > > +	hva_t table_base_va;
> > > > +	gpa_t pba_base_addr;
> > > > +	hva_t pba_base_va;
> > > > +};
> > > > +
> > > > +struct kvm_msix_mmio_dev {
> > > > +	struct kvm *kvm;
> > > > +	struct kvm_io_device table_dev;
> > > > +	int mmio_nr;
> > > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > > +	struct mutex lock;
> > > > +};
> > > > +
> > > > 
> > > >  struct kvm {
> > > >  
> > > >  	spinlock_t mmu_lock;
> > > >  	raw_spinlock_t requests_lock;
> > > > 
> > > > @@ -281,6 +302,7 @@ struct kvm {
> > > > 
> > > >  	long mmu_notifier_count;
> > > >  
> > > >  #endif
> > > >  
> > > >  	long tlbs_dirty;
> > > > 
> > > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > > 
> > > >  };
> > > >  
> > > >  /* The guest did something we don't support. */
> > > > 
> > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm
> > > > *kvm,
> > > > 
> > > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > > 
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +			int assigned_dev_id, int entry, bool mask);
> > > > +
> > > > 
> > > >  /* For vcpu->arch.iommu_flags */
> > > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > > 
> > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > > index ae72ae6..d1598a6 100644
> > > > --- a/virt/kvm/assigned-dev.c
> > > > +++ b/virt/kvm/assigned-dev.c
> > > > @@ -18,6 +18,7 @@
> > > > 
> > > >  #include <linux/interrupt.h>
> > > >  #include <linux/slab.h>
> > > >  #include "irq.h"
> > > > 
> > > > +#include "msix_mmio.h"
> > > > 
> > > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > > >  list_head *head,
> > > >  
> > > >  						      int assigned_dev_id)
> > > > 
> > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm
> > > > *kvm,
> > > > 
> > > >  	kvm_deassign_irq(kvm, assigned_dev,
> > > >  	assigned_dev->irq_requested_type);
> > > >  
> > > >  }
> > > > 
> > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > > +				struct kvm_assigned_dev_kernel *adev)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = adev->assigned_dev_id;
> > > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > > +}
> > > > +
> > > > 
> > > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > > >  
> > > >  				     struct kvm_assigned_dev_kernel
> > > >  				     *assigned_dev)
> > > >  
> > > >  {
> > > >  
> > > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > > 
> > > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > > +
> > > > 
> > > >  	__pci_reset_function(assigned_dev->dev);
> > > >  	pci_restore_state(assigned_dev->dev);
> > > > 
> > > > @@ -785,3 +799,33 @@ out:
> > > >  	return r;
> > > >  
> > > >  }
> > > > 
> > > > +/* The caller should hold kvm->lock */
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +				int assigned_dev_id, int entry, bool mask)
> > > > +{
> > > > +	int r = -EFAULT;
> > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > +	int i;
> > > > +
> > > > +	if (!irqchip_in_kernel(kvm))
> > > > +		return r;
> > > > +
> > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > +				      assigned_dev_id);
> > > > +	if (!adev)
> > > > +		goto out;
> > > > +
> > > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > > +	for (i = 0; i < adev->entries_nr; i++)
> > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > +			if (mask)
> > > > +				disable_irq_nosync(
> > > > +					adev->host_msix_entries[i].vector);
> > > > +			else
> > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > +			r = 0;
> > > > +			break;
> > > > +		}
> > > > +out:
> > > > +	return r;
> > > > +}
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index b1b6cbb..b7807c8 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -56,6 +56,7 @@
> > > > 
> > > >  #include "coalesced_mmio.h"
> > > >  #include "async_pf.h"
> > > > 
> > > > +#include "msix_mmio.h"
> > > > 
> > > >  #define CREATE_TRACE_POINTS
> > > >  #include <trace/events/kvm.h>
> > > > 
> > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > > 
> > > >  	struct mm_struct *mm = kvm->mm;
> > > >  	
> > > >  	kvm_arch_sync_events(kvm);
> > > > 
> > > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > > 
> > > >  	spin_lock(&kvm_lock);
> > > >  	list_del(&kvm->vm_list);
> > > >  	spin_unlock(&kvm_lock);
> > > > 
> > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > > 
> > > >  		mutex_unlock(&kvm->lock);
> > > >  		break;
> > > >  
> > > >  #endif
> > > > 
> > > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > 
> > > >  	default:
> > > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > > >  		if (r == -ENOTTY)
> > > > 
> > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > 
> > > >  		return r;
> > > >  	
> > > >  	}
> > > >  
> > > >  #endif
> > > > 
> > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > +	if (r < 0) {
> > > > +		kvm_put_kvm(kvm);
> > > > +		return r;
> > > > +	}
> > > > +
> > > > 
> > > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > > >  	if (r < 0)
> > > >  	
> > > >  		kvm_put_kvm(kvm);
> > > > 
> > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct
> > > > kvm_io_bus *bus)
> > > > 
> > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t
> > > >  addr,
> > > >  
> > > >  		     int len, const void *val)
> > > >  
> > > >  {
> > > > 
> > > > -	int i;
> > > > +	int i, r = -EOPNOTSUPP;
> > > > 
> > > >  	struct kvm_io_bus *bus;
> > > >  	
> > > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > > 
> > > > -	for (i = 0; i < bus->dev_count; i++)
> > > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > > > +	for (i = 0; i < bus->dev_count; i++) {
> > > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > > > +		if (r == -ENOTSYNC)
> > > > +			break;
> > > > +		else if (!r)
> > > > 
> > > >  			return 0;
> 
> So the above will be
>  if (r >= 0)
> 	 return r;
> 
> > > > -	return -EOPNOTSUPP;
> > > > +	}
> > > > +	return r;
> > > > 
> > > >  }
> > > >  
> > > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > > 
> > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > > new file mode 100644
> > > > index 0000000..dde9b62
> > > > --- /dev/null
> > > > +++ b/virt/kvm/msix_mmio.c
> > > > @@ -0,0 +1,286 @@
> > > > +/*
> > > > + * MSI-X MMIO emulation
> > > > + *
> > > > + * Copyright (c) 2010 Intel Corporation
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > > + * the COPYING file in the top-level directory.
> > > > + *
> > > > + * Author:
> > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/kvm_host.h>
> > > > +#include <linux/kvm.h>
> > > > +
> > > > +#include "msix_mmio.h"
> > > > +#include "iodev.h"
> > > > +
> > > > +static int update_msix_mask_bit(struct kvm *kvm, struct
> > > > kvm_msix_mmio *mmio, +				int entry, u32 flag)
> > > > +{
> > > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > > +				mmio->dev_id, entry, flag);
> > > > +	return -EFAULT;
> > > > +}
> > > > +
> > > > +/* Caller must hold dev->lock */
> > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > > +				gpa_t addr, int len)
> > > > +{
> > > > +	gpa_t start, end;
> > > > +	int i, r = -EINVAL;
> > > > +
> > > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > > +		start = dev->mmio[i].table_base_addr;
> > > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > > +			dev->mmio[i].max_entries_nr;
> > > > +		if (addr >= start && addr + len <= end) {
> > > > +			r = i;
> > > > +			break;
> > > > +		}
> > > 
> > > Yet another potential user of the weight balanced tree ;)
> > 
> > Yeah, we can improve it later.
> > 
> > > > +	}
> > > > +
> > > > +	return r;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t
> > > > addr, int len, +				void *val)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, ret = 0, entry, offset, r;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > +		goto out;
> > > > +
> > > > +	offset = addr & 0xf;
> > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > +		goto out;
> 
> This is not as strict as it could be. Also
> see below (in write) for an idea how to make the code clearer.
> 
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > +	if (r)
> > > > +		goto out;
> 
> Want to give the caller some indication of an error?
> If not add a comment why.

I don't think we have a proper way to indicate the error now. Also the 
table_base_va already passed the check when the MMIO was registered. We can add more 
error info later.
> 
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t
> > > > addr, +				int len, const void *val)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > +	gpa_t entry_base;
> > > > +	u32 old_ctrl, new_ctrl;
> 
> These are really __le32. Pls make them so and then
> fix up the sparse errors.
> 
> > > > +	u32 *ctrl_pos;
> 
> get_user should not be done on u32 *, it should be
> __user.  Does not sparse complain about this?
> 

Haven't used sparse to check the patches. Will do it later.
> > > > +
> > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > +		goto out;
> > > > +
> > > > +	offset = addr & 0xF;
> > > 
> > > nit, this 0xf above.
> > 
> > So you like another mask?
> 
> Yes, this should be
> 
> 	offset = addr % PCI_MSIX_ENTRY_SIZE;
> 
> > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > +		goto out;
> 
> I think it's a good idea to be as strict as possible.
> Also let's quote the spec. Finally using % will make
> the logic more transparent. How about the below?
> 
> 	/* For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> 	   aligned full DWORD or aligned full QWORD transactions; otherwise, the
> 	   result is undefined. */
> 	if (len == 4) {
> 		if (addr % 4)
> 			goto out;
>         } else if (len == 8) {
> 		if (addr % 8)
> 			goto out;
>         } else
> 		goto out;
> 
> As read needs the same logic, it's likely a good idea
> to add a function for this.
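
A minimal sketch of such a shared helper (the name is hypothetical):

	/* Only aligned full DWORD or aligned full QWORD accesses are
	 * defined by the spec; reject everything else. */
	static bool msix_mmio_access_valid(gpa_t addr, int len)
	{
		if (len != 4 && len != 8)
			return false;
		return addr % len == 0;
	}

Both msix_table_mmio_read() and msix_table_mmio_write() could then call
this before doing any work.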
> 
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > +
> > > > +	if (get_user(old_ctrl, ctrl_pos))
> 
> It's probably a good idea to do an access_ok() check for the whole
> entry here, and then use __get_user() etc. below; these are cheaper.

table_base_va already passed the access_ok() check when the MMIO region was 
registered. I am not sure we need to do it again.
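
If the registration-time check is trusted, the per-access pattern being
suggested would look roughly like this (a sketch against the patch's
locals; __get_user() skips the per-call access_ok() that get_user()
performs):

	if (!access_ok(VERIFY_WRITE, (void __user *)entry_base,
		       PCI_MSIX_ENTRY_SIZE))
		goto out;
	if (__get_user(old_ctrl, ctrl_pos))
		goto out;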
> 
> > > > +		goto out;
> > > > +
> > > > +	/* No allow writing to other fields when entry is unmasked */
> 
> Do not allow.
> 
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> 
> any such cast is suspect.
> Is entry_base really a gpa? If it were, you could not cast it to __user.

HVA.

> I think it should be void __user *.
> 
> > > > +		goto out;
> > > > +
> > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > +		goto out;
> 
> get_user() assumes an aligned address, but there's no guarantee ctrl_pos
> is aligned because table_base_va might be misaligned.

table_base_va already passed the access_ok() check.
> 
> > > > +
> > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > +		ret = -ENOTSYNC;
> 
> Better to use != and not <; it's effectively the same.
> Or in the above we can do:
> 	bool control;
> 	if (len == 4) {
> 		if (addr % 4)
> 			goto out;
> 		control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL;
>         } else if (len == 8) {
> 		if (addr % 8)
> 			goto out;
> 		control = offset == PCI_MSIX_ENTRY_DATA;
>         } else
> 		goto out;
> 
> > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> > > first?  I'd also prefer to see a direct goto out here since it's not
> > > entirely obvious that this first 'if' always falls into the below 'if'.
> > > I'm not a fan of using an error code as a special non-error return
> > > either.
> > 
> > In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check
> > new_ctrl to see whether the write indeed modified the ctrl bits.
> > 
> > I would try to make the logic here clearer.
> > 
> > > > +	if (old_ctrl == new_ctrl)
> > > > +		goto out;
> > > > +
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > 
> > > An xor would remove some redundancy here
> > > 
> > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl &
> > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm,
> > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > 
> > Oh, yes, sure...
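
Unwrapped from the mail quoting, Alex's suggestion reads as follows
(same logic; testing (old_ctrl ^ new_ctrl) against the mask bit is an
equivalent way to ask whether the bit toggled):

	if ((old_ctrl ^ new_ctrl) & PCI_MSIX_ENTRY_CTRL_MASKBIT)
		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));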
> 
> Also, why not set ret to the correct value directly?
> 
> > > > +	if (r || ret)
> > > > +		ret = -ENOTSYNC;
> 
> When is ret set? And why not set it to the correct value directly?
> 
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > +	.read     = msix_table_mmio_read,
> > > > +	.write    = msix_table_mmio_write,
> > > > +};
> > > > +
> > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > > > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > +	int r = 0, i;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > +			mmio = &mmio_dev->mmio[i];
> > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > +				r = -EINVAL;
> > > > +				goto out;
> > > > +			}
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	/* All reserved currently */
> > > > +	if (mmio_user->flags) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	if (!mmio) {
> > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > +			r = -ENOSPC;
> > > > +			goto out;
> > > > +		}
> > > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > +	}
> > > > +
> > > > +	mmio_dev->mmio_nr++;
> > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > +	mmio->flags = mmio_user->flags;
> > > > +
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > +		mmio->table_base_va = mmio_user->base_va;
> 
> Also check alignment for these values. base_va should be
> at least 4-byte aligned. Maybe more; I need to think about it...
> Look in the spec for the table_base_addr alignment requirement.

OK.
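
A sketch of such a registration-time check, assuming 8-byte alignment
as the floor (the spec's Table Offset field is QWORD-aligned, since its
low three bits hold the BIR):

	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}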

--
regards
Yang, Sheng

> 
> > > > +	}
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > +	int r = 0, i, j;
> > > 
> > > set r = -EINVAL here
> > > 
> > > > +	bool found = 0;
> > > > +
> > > > +	if (!mmio)
> > > > +		return 0;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > > +			found = true;
> > > 
> > > set r = 0 here
> > > 
> > > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > > +			mmio_dev->mmio_nr--;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	if (!found)
> > > > +		r = -EINVAL;
> > > 
> > > then get rid of found
> > 
> > Sure.
> > 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > +	mmio.type = mmio_user->type;
> > > > +
> > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > +}
> > > > +
> > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > > new file mode 100644
> > > > index 0000000..01b6587
> > > > --- /dev/null
> > > > +++ b/virt/kvm/msix_mmio.h
> > > > @@ -0,0 +1,25 @@
> > > > +#ifndef __KVM_MSIX_MMIO_H__
> > > > +#define __KVM_MSIX_MMIO_H__
> > > > +/*
> > > > + * MSI-X MMIO emulation
> > > > + *
> > > > + * Copyright (c) 2010 Intel Corporation
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2. 
> > > > See + * the COPYING file in the top-level directory.
> > > > + *
> > > > + * Author:
> > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/pci.h>
> > > > +
> > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> > > > +
> > > > +#endif

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23 18:39         ` Michael S. Tsirkin
@ 2011-02-23 19:02           ` Alex Williamson
  0 siblings, 0 replies; 45+ messages in thread
From: Alex Williamson @ 2011-02-23 19:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Avi Kivity, Marcelo Tosatti, kvm

On Wed, 2011-02-23 at 20:39 +0200, Michael S. Tsirkin wrote:
> On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote:
> > On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote:
> > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > > > > +				void *val)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > +	struct kvm_msix_mmio *mmio;
> > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > +	if (idx < 0) {
> > > > > +		ret = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > > +		goto out;
> > > > > +
> > > > > +	offset = addr & 0xf;
> > > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > > +		goto out;
> > > > > +
> > > > > +	mmio = &mmio_dev->mmio[idx];
> > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > +	if (r)
> > > > > +		goto out;
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > > > +				int len, const void *val)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > +	struct kvm_msix_mmio *mmio;
> > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > +	gpa_t entry_base;
> > > > > +	u32 old_ctrl, new_ctrl;
> > > > > +	u32 *ctrl_pos;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > +	if (idx < 0) {
> > > > > +		ret = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > > +		goto out;
> > > > > +
> > > > > +	offset = addr & 0xF;
> > > > 
> > > > nit, this 0xf above.
> > > 
> > > So you like another mask?
> > 
> > I'm just noting that above you used 0xf and here you used 0xF.
> > 
> > > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > > +		goto out;
> > > > > +
> > > > > +	mmio = &mmio_dev->mmio[idx];
> > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > > +
> > > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > > > +		goto out;
> > > > > +
> > > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > +		goto out;
> > > > > +
> > > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > > > +		goto out;
> > > > > +
> > > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > > +		goto out;
> > > > > +
> > > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > > +		ret = -ENOTSYNC;
> > > > 
> > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> > > > first?  I'd also prefer to see a direct goto out here since it's not
> > > > entirely obvious that this first 'if' always falls into the below 'if'.
> > > > I'm not a fan of using an error code as a special non-error return
> > > > either.
> > > 
> > > In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check new_ctrl to
> > > see whether the write indeed modified the ctrl bits.
> > > 
> > > I would try to make the logic here clearer.
> > 
> > Isn't that what you're testing for though?
> > 
> > (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4)
> > 
> > If this is true, we can only be writing address, upper address, or data,
> > not control.
> > 
> > (offset < PCI_MSIX_ENTRY_DATA && len == 8)
> > 
> > If this is true, we're writing address and upper address, not the
> > control vector.
> > 
> > Additionally, I think the size and alignment test should be more
> > restrictive.  We really only want to allow naturally aligned 4 or 8 byte
> > accesses.  Maybe instead of:
> > 
> > if ((addr & 0x3) || (len != 4 && len != 8))
> > 
> > we should do this:
> > 
> > if (!(len == 4 || len == 8) || addr & (len - 1))
> > 
> > Then the test could become:
> > 
> > if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
> > 
> > Thanks,
> > 
> > Alex
> 
> Or rather
> 
>  if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4)
> 
> we are matching offset + length against control offset + control length,
> avoiding any dependence on the fact that control is the last register.

That's just silly.  We're writing code to match a hardware
specification.  The registers always land at the same place, are always
the same size, and always have the alignment requirements dictated by
the spec.  I see no problem depending on that.  They both do the same
thing; I prefer my version.

Alex
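
For what it's worth, the two tests agree on every access that survives
the stricter alignment check; a throwaway userspace sketch verifying
that (ctrl sits at offset 12 of a 16-byte entry):

	#include <assert.h>
	#include <stdbool.h>

	#define CTRL 12	/* PCI_MSIX_ENTRY_VECTOR_CTRL */

	static bool touches_ctrl_aw(int offset, int len)
	{
		return !(offset + len <= CTRL);
	}

	static bool touches_ctrl_mst(int offset, int len)
	{
		return offset + len == CTRL + 4;
	}

	int main(void)
	{
		/* all naturally aligned 4/8-byte accesses to one entry */
		static const int acc[][2] = {
			{0, 4}, {4, 4}, {8, 4}, {12, 4}, {0, 8}, {8, 8},
		};
		int i;

		for (i = 0; i < 6; i++)
			assert(touches_ctrl_aw(acc[i][0], acc[i][1]) ==
			       touches_ctrl_mst(acc[i][0], acc[i][1]));
		return 0;
	}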




^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23 16:34       ` Alex Williamson
@ 2011-02-23 18:39         ` Michael S. Tsirkin
  2011-02-23 19:02           ` Alex Williamson
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2011-02-23 18:39 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Sheng Yang, Avi Kivity, Marcelo Tosatti, kvm

On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote:
> On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote:
> > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > > > +				void *val)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, ret = 0, entry, offset, r;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > +		goto out;
> > > > +
> > > > +	offset = addr & 0xf;
> > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > +		goto out;
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > +	if (r)
> > > > +		goto out;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > > +				int len, const void *val)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > +	gpa_t entry_base;
> > > > +	u32 old_ctrl, new_ctrl;
> > > > +	u32 *ctrl_pos;
> > > > +
> > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > +		goto out;
> > > > +
> > > > +	offset = addr & 0xF;
> > > 
> > > nit, this 0xf above.
> > 
> > So you like another mask?
> 
> I'm just noting that above you used 0xf and here you used 0xF.
> 
> > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > +		goto out;
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > +
> > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > > +		goto out;
> > > > +
> > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > +		ret = -ENOTSYNC;
> > > 
> > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> > > first?  I'd also prefer to see a direct goto out here since it's not
> > > entirely obvious that this first 'if' always falls into the below 'if'.
> > > I'm not a fan of using an error code as a special non-error return
> > > either.
> > 
> > In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check new_ctrl to
> > see whether the write indeed modified the ctrl bits.
> > 
> > I would try to make the logic here clearer.
> 
> Isn't that what you're testing for though?
> 
> (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4)
> 
> If this is true, we can only be writing address, upper address, or data,
> not control.
> 
> (offset < PCI_MSIX_ENTRY_DATA && len == 8)
> 
> If this is true, we're writing address and upper address, not the
> control vector.
> 
> Additionally, I think the size and alignment test should be more
> restrictive.  We really only want to allow naturally aligned 4 or 8 byte
> accesses.  Maybe instead of:
> 
> if ((addr & 0x3) || (len != 4 && len != 8))
> 
> we should do this:
> 
> if (!(len == 4 || len == 8) || addr & (len - 1))
> 
> Then the test could become:
> 
> if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
> 
> Thanks,
> 
> Alex

Or rather

 if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4)

we are matching offset + length against control offset + control length,
avoiding any dependence on the fact that control is the last register.

-- 
MST

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23  6:59     ` Sheng Yang
  2011-02-23  8:45       ` Michael S. Tsirkin
@ 2011-02-23 16:34       ` Alex Williamson
  2011-02-23 18:39         ` Michael S. Tsirkin
  1 sibling, 1 reply; 45+ messages in thread
From: Alex Williamson @ 2011-02-23 16:34 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm

On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote:
> On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > > +				void *val)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, ret = 0, entry, offset, r;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > +		goto out;
> > > +
> > > +	offset = addr & 0xf;
> > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > +		goto out;
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > +	if (r)
> > > +		goto out;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > +				int len, const void *val)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, entry, offset, ret = 0, r = 0;
> > > +	gpa_t entry_base;
> > > +	u32 old_ctrl, new_ctrl;
> > > +	u32 *ctrl_pos;
> > > +
> > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > +		goto out;
> > > +
> > > +	offset = addr & 0xF;
> > 
> > nit, this 0xf above.
> 
> So you like another mask?

I'm just noting that above you used 0xf and here you used 0xF.

> > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > +		goto out;
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > +
> > > +	if (get_user(old_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	/* No allow writing to other fields when entry is unmasked */
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > +		goto out;
> > > +
> > > +	if (get_user(new_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > +		ret = -ENOTSYNC;
> > 
> > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> > first?  I'd also prefer to see a direct goto out here since it's not
> > entirely obvious that this first 'if' always falls into the below 'if'.
> > I'm not a fan of using an error code as a special non-error return
> > either.
> 
> In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check new_ctrl to
> see whether the write indeed modified the ctrl bits.
> 
> I would try to make the logic here clearer.

Isn't that what you're testing for though?

(offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4)

If this is true, we can only be writing address, upper address, or data,
not control.

(offset < PCI_MSIX_ENTRY_DATA && len == 8)

If this is true, we're writing address and upper address, not the
control vector.

Additionally, I think the size and alignment test should be more
restrictive.  We really only want to allow naturally aligned 4 or 8 byte
accesses.  Maybe instead of:

if ((addr & 0x3) || (len != 4 && len != 8))

we should do this:

if (!(len == 4 || len == 8) || addr & (len - 1))

Then the test could become:

if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)

Thanks,

Alex
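
For reference, the "addr & (len - 1)" idiom works because, for a
power-of-two len, len - 1 is a mask of the low bits, so the AND is
nonzero exactly when addr is not a multiple of len; a sketch:

	static inline bool naturally_aligned(u64 addr, int len)
	{
		/* len must be 4 or 8 (a power of two) for this to hold */
		return (addr & (len - 1)) == 0;
	}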


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23  6:59     ` Sheng Yang
@ 2011-02-23  8:45       ` Michael S. Tsirkin
  2011-02-24  8:08         ` Sheng Yang
  2011-02-24  9:44         ` Sheng Yang
  2011-02-23 16:34       ` Alex Williamson
  1 sibling, 2 replies; 45+ messages in thread
From: Michael S. Tsirkin @ 2011-02-23  8:45 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm

On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
> On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > > Then we can support mask bit operation of assigned devices now.
> > 
> > Looks pretty good overall.  A few comments below.  It seems like we
> > should be able to hook this into vfio with a small stub in kvm.  We just
> > need to be able to communicate disabling and enabling of individual msix
> > vectors.  For brute force, we could do this with a lot of eventfds,
> > triggered by kvm and consumed by vfio, two per MSI-X vector.  Not sure
> > if there's something smaller that could do it.  Thanks,
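
(Purely illustrative: the "two eventfds per vector" idea floated above
might look like the struct below. The name and any kvm/vfio plumbing
for it are hypothetical; vfio could consume these and toggle the host
IRQ on its side.

	struct msix_vector_eventfds {
		struct eventfd_ctx *mask;	/* kvm signals: guest masked the vector */
		struct eventfd_ctx *unmask;	/* kvm signals: guest unmasked it */
	};
)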
> 
> Alex, thanks for your comments. See my comments below:
> 
> > 
> > Alex
> > 
> > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > ---
> > > 
> > >  arch/x86/kvm/Makefile    |    2 +-
> > >  arch/x86/kvm/x86.c       |    8 +-
> > >  include/linux/kvm.h      |   21 ++++
> > >  include/linux/kvm_host.h |   25 ++++
> > >  virt/kvm/assigned-dev.c  |   44 +++++++
> > >  virt/kvm/kvm_main.c      |   38 ++++++-
> > >  virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  virt/kvm/msix_mmio.h     |   25 ++++
> > >  8 files changed, 442 insertions(+), 7 deletions(-)
> > >  create mode 100644 virt/kvm/msix_mmio.c
> > >  create mode 100644 virt/kvm/msix_mmio.h
> > > 
> > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > index f15501f..3a0d851 100644
> > > --- a/arch/x86/kvm/Makefile
> > > +++ b/arch/x86/kvm/Makefile
> > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > 
> > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > >  
> > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > 
> > > -				assigned-dev.o)
> > > +				assigned-dev.o msix_mmio.o)
> > > 
> > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > >  async_pf.o)
> > > 
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index fa708c9..89bf12c 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > 
> > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > >  	case KVM_CAP_XSAVE:
> > > 
> > >  	case KVM_CAP_ASYNC_PF:
> > > +	case KVM_CAP_MSIX_MMIO:
> > >  		r = 1;
> > >  		break;
> > >  	
> > >  	case KVM_CAP_COALESCED_MMIO:
> > > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned
> > > long addr,
> > > 
> > >  					   struct kvm_vcpu *vcpu)
> > >  
> > >  {
> > >  
> > >  	gpa_t                 gpa;
> > > 
> > > +	int r;
> > > 
> > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > 
> > > @@ -3822,14 +3824,16 @@ static int
> > > emulator_write_emulated_onepage(unsigned long addr,
> > > 
> > >  mmio:
> > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > 
> > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > 
> > >  	/*
> > >  	
> > >  	 * Is this MMIO handled locally?
> > >  	 */
> > > 
> > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > +	if (!r)
> > > 
> > >  		return X86EMUL_CONTINUE;
> > >  	
> > >  	vcpu->mmio_needed = 1;
> > > 
> > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;

This use of -ENOTSYNC is IMO confusing.
How about we make vcpu_mmio_write() return the positive exit reason?
A negative value will mean an error.
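
A sketch of that convention (hypothetical): vcpu_mmio_write() returns a
positive KVM_EXIT_* code when the access was handled but userspace must
still be notified, 0 when fully handled, and a negative errno on error:

	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
	if (!r)
		return X86EMUL_CONTINUE;

	vcpu->mmio_needed = 1;
	vcpu->run->exit_reason = r > 0 ? r : KVM_EXIT_MMIO;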

> > > 
> > >  	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > >  	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > >  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > 
> > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > index ea2dc1a..ad9df4b 100644
> > > --- a/include/linux/kvm.h
> > > +++ b/include/linux/kvm.h
> > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > 
> > >  #define KVM_EXIT_NMI              16
> > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > >  #define KVM_EXIT_OSI              18
> > > 
> > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > 
> > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > 
> > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
> > > 
> > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > >  #define KVM_CAP_ASYNC_PF 59
> > > 
> > > +#define KVM_CAP_MSIX_MMIO 60
> > > 
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > 
> > > @@ -672,6 +674,9 @@ struct kvm_clock_data {
> > > 
> > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> > >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO, 
> > >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK            
> > >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > 
> > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> > > 
> > >  /* Available with KVM_CAP_PIT_STATE2 */
> > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> > >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> > >  struct kvm_pit_state2)
> > > 
> > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
> > > 
> > >  	__u16 padding[3];
> > >  
> > >  };
> > > 
> > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > +struct kvm_msix_mmio_user {
> > > +	__u32 dev_id;
> > > +	__u16 type;
> > > +	__u16 max_entries_nr;
> > > +	__u64 base_addr;
> > > +	__u64 base_va;
> > > +	__u64 flags;
> > > +	__u64 reserved[4];
> > > +};
> > > +
> > > 
> > >  #endif /* __LINUX_KVM_H */
> > > 
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 7d313e0..c10670c 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -233,6 +233,27 @@ struct kvm_memslots {
> > > 
> > >  					KVM_PRIVATE_MEM_SLOTS];
> > >  
> > >  };
> > > 
> > > +#define KVM_MSIX_MMIO_MAX    32
> > 
> > I assume this is based on 32 devices/bus, but we could exceed this is we
> > add more buses or make use of more functions.  How hard is it to make
> > this dynamic?
> 
> We can improve this later (maybe with a balanced tree?). 32 should be more than
> sufficient for now. And this patchset has been on the mailing list for months; I
> really want to get it in first...
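
For reference, a krealloc()-based sketch of what "dynamic" could look
like; the mmio pointer and mmio_cap field are hypothetical changes to
struct kvm_msix_mmio_dev:

	if (mmio_dev->mmio_nr == mmio_dev->mmio_cap) {
		int cap = mmio_dev->mmio_cap ? mmio_dev->mmio_cap * 2 : 16;
		struct kvm_msix_mmio *p = krealloc(mmio_dev->mmio,
				cap * sizeof(*p), GFP_KERNEL);

		if (!p)
			return -ENOMEM;
		mmio_dev->mmio = p;
		mmio_dev->mmio_cap = cap;
	}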
>  
> > > +
> > > +struct kvm_msix_mmio {
> > > +	u32 dev_id;
> > > +	u16 type;
> > > +	u16 max_entries_nr;
> > > +	u64 flags;
> > > +	gpa_t table_base_addr;
> > > +	hva_t table_base_va;
> > > +	gpa_t pba_base_addr;
> > > +	hva_t pba_base_va;
> > > +};
> > > +
> > > +struct kvm_msix_mmio_dev {
> > > +	struct kvm *kvm;
> > > +	struct kvm_io_device table_dev;
> > > +	int mmio_nr;
> > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > +	struct mutex lock;
> > > +};
> > > +
> > > 
> > >  struct kvm {
> > >  
> > >  	spinlock_t mmu_lock;
> > >  	raw_spinlock_t requests_lock;
> > > 
> > > @@ -281,6 +302,7 @@ struct kvm {
> > > 
> > >  	long mmu_notifier_count;
> > >  
> > >  #endif
> > >  
> > >  	long tlbs_dirty;
> > > 
> > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > 
> > >  };
> > >  
> > >  /* The guest did something we don't support. */
> > > 
> > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > > 
> > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > 
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +			int assigned_dev_id, int entry, bool mask);
> > > +
> > > 
> > >  /* For vcpu->arch.iommu_flags */
> > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > 
> > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > index ae72ae6..d1598a6 100644
> > > --- a/virt/kvm/assigned-dev.c
> > > +++ b/virt/kvm/assigned-dev.c
> > > @@ -18,6 +18,7 @@
> > > 
> > >  #include <linux/interrupt.h>
> > >  #include <linux/slab.h>
> > >  #include "irq.h"
> > > 
> > > +#include "msix_mmio.h"
> > > 
> > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > >  list_head *head,
> > >  
> > >  						      int assigned_dev_id)
> > > 
> > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > > 
> > >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> > >  
> > >  }
> > > 
> > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > +				struct kvm_assigned_dev_kernel *adev)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = adev->assigned_dev_id;
> > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > +}
> > > +
> > > 
> > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > >  
> > >  				     struct kvm_assigned_dev_kernel
> > >  				     *assigned_dev)
> > >  
> > >  {
> > >  
> > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > 
> > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > +
> > > 
> > >  	__pci_reset_function(assigned_dev->dev);
> > >  	pci_restore_state(assigned_dev->dev);
> > > 
> > > @@ -785,3 +799,33 @@ out:
> > >  	return r;
> > >  
> > >  }
> > > 
> > > +/* The caller should hold kvm->lock */
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +				int assigned_dev_id, int entry, bool mask)
> > > +{
> > > +	int r = -EFAULT;
> > > +	struct kvm_assigned_dev_kernel *adev;
> > > +	int i;
> > > +
> > > +	if (!irqchip_in_kernel(kvm))
> > > +		return r;
> > > +
> > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > +				      assigned_dev_id);
> > > +	if (!adev)
> > > +		goto out;
> > > +
> > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > +	for (i = 0; i < adev->entries_nr; i++)
> > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > +			if (mask)
> > > +				disable_irq_nosync(
> > > +					adev->host_msix_entries[i].vector);
> > > +			else
> > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > +			r = 0;
> > > +			break;
> > > +		}
> > > +out:
> > > +	return r;
> > > +}
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index b1b6cbb..b7807c8 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -56,6 +56,7 @@
> > > 
> > >  #include "coalesced_mmio.h"
> > >  #include "async_pf.h"
> > > 
> > > +#include "msix_mmio.h"
> > > 
> > >  #define CREATE_TRACE_POINTS
> > >  #include <trace/events/kvm.h>
> > > 
> > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > 
> > >  	struct mm_struct *mm = kvm->mm;
> > >  	
> > >  	kvm_arch_sync_events(kvm);
> > > 
> > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > 
> > >  	spin_lock(&kvm_lock);
> > >  	list_del(&kvm->vm_list);
> > >  	spin_unlock(&kvm_lock);
> > > 
> > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > 
> > >  		mutex_unlock(&kvm->lock);
> > >  		break;
> > >  
> > >  #endif
> > > 
> > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > +		struct kvm_msix_mmio_user mmio_user;
> > > +
> > > +		r = -EFAULT;
> > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > +			goto out;
> > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > +		break;
> > > +	}
> > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > +		struct kvm_msix_mmio_user mmio_user;
> > > +
> > > +		r = -EFAULT;
> > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > +			goto out;
> > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > +		break;
> > > +	}
> > > 
> > >  	default:
> > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > >  		if (r == -ENOTTY)
> > > 
> > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > 
> > >  		return r;
> > >  	
> > >  	}
> > >  
> > >  #endif
> > > 
> > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > +	if (r < 0) {
> > > +		kvm_put_kvm(kvm);
> > > +		return r;
> > > +	}
> > > +
> > > 
> > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > >  	if (r < 0)
> > >  	
> > >  		kvm_put_kvm(kvm);
> > > 
> > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > > *bus)
> > > 
> > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > >  
> > >  		     int len, const void *val)
> > >  
> > >  {
> > > 
> > > -	int i;
> > > +	int i, r = -EOPNOTSUPP;
> > > 
> > >  	struct kvm_io_bus *bus;
> > >  	
> > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > 
> > > -	for (i = 0; i < bus->dev_count; i++)
> > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > > +	for (i = 0; i < bus->dev_count; i++) {
> > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > > +		if (r == -ENOTSYNC)
> > > +			break;
> > > +		else if (!r)
> > > 
> > >  			return 0;

So the above will be
 if (r >= 0)
	 return r;

> > > 
> > > -	return -EOPNOTSUPP;
> > > +	}
> > > +	return r;
> > > 
> > >  }
> > >  
> > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > 
> > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > new file mode 100644
> > > index 0000000..dde9b62
> > > --- /dev/null
> > > +++ b/virt/kvm/msix_mmio.c
> > > @@ -0,0 +1,286 @@
> > > +/*
> > > + * MSI-X MMIO emulation
> > > + *
> > > + * Copyright (c) 2010 Intel Corporation
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + *
> > > + * Author:
> > > + *   Sheng Yang <sheng.yang@intel.com>
> > > + */
> > > +
> > > +#include <linux/kvm_host.h>
> > > +#include <linux/kvm.h>
> > > +
> > > +#include "msix_mmio.h"
> > > +#include "iodev.h"
> > > +
> > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio
> > > *mmio, +				int entry, u32 flag)
> > > +{
> > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > +				mmio->dev_id, entry, flag);
> > > +	return -EFAULT;
> > > +}
> > > +
> > > +/* Caller must hold dev->lock */
> > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > +				gpa_t addr, int len)
> > > +{
> > > +	gpa_t start, end;
> > > +	int i, r = -EINVAL;
> > > +
> > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > +		start = dev->mmio[i].table_base_addr;
> > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > +			dev->mmio[i].max_entries_nr;
> > > +		if (addr >= start && addr + len <= end) {
> > > +			r = i;
> > > +			break;
> > > +		}
> > 
> > Yet another potential user of the weight balanced tree ;)
> 
> Yeah, we can improve it later.
> > 
> > > +	}
> > > +
> > > +	return r;
> > > +}
> > > +
> > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > > +				void *val)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, ret = 0, entry, offset, r;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > +		goto out;
> > > +
> > > +	offset = addr & 0xf;
> > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > +		goto out;

This is not as strict as it could be. Also
see below (in write) for an idea of how to make the code clearer.

> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > +	if (r)
> > > +		goto out;

Want to give the caller some indication of an error?
If not, add a comment explaining why.

> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > +				int len, const void *val)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, entry, offset, ret = 0, r = 0;
> > > +	gpa_t entry_base;
> > > +	u32 old_ctrl, new_ctrl;

These are really __le32. Please make them so and then
fix up the sparse errors.

> > > +	u32 *ctrl_pos;

get_user() should not be done on a u32 *; it should be a
__user pointer.  Doesn't sparse complain about this?

> > > +
> > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > +		goto out;
> > > +
> > > +	offset = addr & 0xF;
> > 
> > nit, this 0xf above.
> 
> So you like another mask?

Yes, this should be

	offset = addr % PCI_MSIX_ENTRY_SIZE;

> > 
> > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > +		goto out;

I think it's a good idea to be as strict as possible.
Also let's quote the spec. Finally using % will make
the logic more transparent. How about the below?

	/* For all accesses to MSI-X Table and MSI-X PBA fields, software must use
	   aligned full DWORD or aligned full QWORD transactions; otherwise, the
	   result is undefined. */
	if (len == 4) {
		if (addr % 4)
			goto out;
        } else if (len == 8) {
		if (addr % 8)
			goto out;
        } else
		goto out;

As read needs the same logic, it's likely a good idea
to add a function for this.

> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > +
> > > +	if (get_user(old_ctrl, ctrl_pos))

It's probably a good idea to do an access_ok() check for the whole
entry here, and then use __get_user() etc. below; these are cheaper.

> > > +		goto out;
> > > +
> > > +	/* No allow writing to other fields when entry is unmasked */

Do not allow.

> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))

any such cast is suspect.
Is entry_base really a gpa? If it were, you could not cast it to __user.
I think it should be void __user *.
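
A sketch of keeping the pointer __user-typed end to end (the local
declarations differ from the patch and are illustrative only):

	void __user *entry_base = (void __user *)mmio->table_base_va +
				  entry * PCI_MSIX_ENTRY_SIZE;
	u32 __user *ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;

	if (copy_to_user(entry_base + offset, val, len))
		goto out;

This also lets sparse check every access against the address-space
annotation.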

> > > +		goto out;
> > > +
> > > +	if (get_user(new_ctrl, ctrl_pos))
> > > +		goto out;

get_user() assumes an aligned address, but there's no guarantee ctrl_pos
is aligned because table_base_va might be misaligned.

> > > +
> > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > +		ret = -ENOTSYNC;

Better to use != and not <; it's effectively the same.
Or in the above we can do:
	bool control;
	if (len == 4) {
		if (addr % 4)
			goto out;
		control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL;
        } else if (len == 8) {
		if (addr % 8)
			goto out;
		control = offset == PCI_MSIX_ENTRY_DATA;
        } else
		goto out;


> > 
> > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> > first?  I'd also prefer to see a direct goto out here since it's not
> > entirely obvious that this first 'if' always falls into the below 'if'.
> > I'm not a fan of using an error code as a special non-error return
> > either.
> 
> In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check new_ctrl to
> see whether the write indeed modified the ctrl bits.
> 
> I would try to make the logic here clearer.
> > 
> > > +	if (old_ctrl == new_ctrl)
> > > +		goto out;
> > > +
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > 
> > An xor would remove some redundancy here
> > 
> > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl &
> > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm,
> > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> 
> Oh, yes, sure...

Also, why not set ret to the correct value directly?

> > 
> > > +	if (r || ret)
> > > +		ret = -ENOTSYNC;

When is ret set? And why not set it to the correct value directly?

> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > +	.read     = msix_table_mmio_read,
> > > +	.write    = msix_table_mmio_write,
> > > +};
> > > +
> > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > +{
> > > +	int ret;
> > > +
> > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > +	mutex_lock(&kvm->slots_lock);
> > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > +				      &kvm->msix_mmio_dev.table_dev);
> > > +	mutex_unlock(&kvm->slots_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > +{
> > > +	int ret;
> > > +
> > > +	mutex_lock(&kvm->slots_lock);
> > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > +				      &kvm->msix_mmio_dev.table_dev);
> > > +	mutex_unlock(&kvm->slots_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > +	struct kvm_msix_mmio *mmio = NULL;
> > > +	int r = 0, i;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > +			mmio = &mmio_dev->mmio[i];
> > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > +				r = -EINVAL;
> > > +				goto out;
> > > +			}
> > > +			break;
> > > +		}
> > > +	}
> > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	/* All reserved currently */
> > > +	if (mmio_user->flags) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	if (!mmio) {
> > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > +			r = -ENOSPC;
> > > +			goto out;
> > > +		}
> > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > +	}
> > > +
> > > +	mmio_dev->mmio_nr++;
> > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > +	mmio->dev_id = mmio_user->dev_id;
> > > +	mmio->flags = mmio_user->flags;
> > > +
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > +		mmio->table_base_va = mmio_user->base_va;

Also check alignment for these values. base_va should be
at least 4-byte aligned. Maybe more; I need to think about it...
Look in the spec for the table_base_addr alignment requirement.


> > > +	}
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > +	int r = 0, i, j;
> > 
> > set r = -EINVAL here
> > 
> > > +	bool found = 0;
> > > +
> > > +	if (!mmio)
> > > +		return 0;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > +			found = true;
> > 
> > set r = 0 here
> > 
> > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > +			mmio_dev->mmio_nr--;
> > > +			break;
> > > +		}
> > > +	}
> > > +	if (!found)
> > > +		r = -EINVAL;
> > 
> > then get rid of found
> 
> Sure.
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = mmio_user->dev_id;
> > > +	mmio.type = mmio_user->type;
> > > +
> > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > +}
> > > +
> > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > new file mode 100644
> > > index 0000000..01b6587
> > > --- /dev/null
> > > +++ b/virt/kvm/msix_mmio.h
> > > @@ -0,0 +1,25 @@
> > > +#ifndef __KVM_MSIX_MMIO_H__
> > > +#define __KVM_MSIX_MMIO_H__
> > > +/*
> > > + * MSI-X MMIO emulation
> > > + *
> > > + * Copyright (c) 2010 Intel Corporation
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + *
> > > + * Author:
> > > + *   Sheng Yang <sheng.yang@intel.com>
> > > + */
> > > +
> > > +#include <linux/pci.h>
> > > +
> > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> > > +
> > > +#endif

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-23  0:19   ` Alex Williamson
@ 2011-02-23  6:59     ` Sheng Yang
  2011-02-23  8:45       ` Michael S. Tsirkin
  2011-02-23 16:34       ` Alex Williamson
  0 siblings, 2 replies; 45+ messages in thread
From: Sheng Yang @ 2011-02-23  6:59 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm

On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
> On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> > Then we can support mask bit operation of assigned devices now.
> 
> Looks pretty good overall.  A few comments below.  It seems like we
> should be able to hook this into vfio with a small stub in kvm.  We just
> need to be able to communicate disabling and enabling of individual msix
> vectors.  For brute force, we could do this with a lot of eventfds,
> triggered by kvm and consumed by vfio, two per MSI-X vector.  Not sure
> if there's something smaller that could do it.  Thanks,

Alex, thanks for your comments. See my comments below:

> 
> Alex
> 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > ---
> > 
> >  arch/x86/kvm/Makefile    |    2 +-
> >  arch/x86/kvm/x86.c       |    8 +-
> >  include/linux/kvm.h      |   21 ++++
> >  include/linux/kvm_host.h |   25 ++++
> >  virt/kvm/assigned-dev.c  |   44 +++++++
> >  virt/kvm/kvm_main.c      |   38 ++++++-
> >  virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
> >  virt/kvm/msix_mmio.h     |   25 ++++
> >  8 files changed, 442 insertions(+), 7 deletions(-)
> >  create mode 100644 virt/kvm/msix_mmio.c
> >  create mode 100644 virt/kvm/msix_mmio.h
> > 
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index f15501f..3a0d851 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > 
> >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> >  
> >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > 
> > -				assigned-dev.o)
> > +				assigned-dev.o msix_mmio.o)
> > 
> >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> >  async_pf.o)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index fa708c9..89bf12c 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> >  	case KVM_CAP_XSAVE:
> >  	case KVM_CAP_ASYNC_PF:
> > +	case KVM_CAP_MSIX_MMIO:
> >  		r = 1;
> >  		break;
> >  	case KVM_CAP_COALESCED_MMIO:
> > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
> >  					   struct kvm_vcpu *vcpu)
> >  {
> >  	gpa_t                 gpa;
> > +	int r;
> >  
> >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> >  
> > @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
> >  
> >  mmio:
> >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> >  	/*
> >  	 * Is this MMIO handled locally?
> >  	 */
> > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > +	if (!r)
> >  		return X86EMUL_CONTINUE;
> >  
> >  	vcpu->mmio_needed = 1;
> > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> >  	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> >  	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> >  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > index ea2dc1a..ad9df4b 100644
> > --- a/include/linux/kvm.h
> > +++ b/include/linux/kvm.h
> > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> >  #define KVM_EXIT_NMI              16
> >  #define KVM_EXIT_INTERNAL_ERROR   17
> >  #define KVM_EXIT_OSI              18
> > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> >  
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
> >  #define KVM_CAP_PPC_GET_PVINFO 57
> >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> >  #define KVM_CAP_ASYNC_PF 59
> > +#define KVM_CAP_MSIX_MMIO 60
> >  
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >  
> > @@ -672,6 +674,9 @@ struct kvm_clock_data {
> >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
> >  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
> >  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > +/* Available with KVM_CAP_MSIX_MMIO */
> > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> >  /* Available with KVM_CAP_PIT_STATE2 */
> >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
> >  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
> >  	__u16 padding[3];
> >  };
> > 
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;
> > +	__u16 max_entries_nr;
> > +	__u64 base_addr;
> > +	__u64 base_va;
> > +	__u64 flags;
> > +	__u64 reserved[4];
> > +};
> > +
> >  #endif /* __LINUX_KVM_H */
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7d313e0..c10670c 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -233,6 +233,27 @@ struct kvm_memslots {
> >  					KVM_PRIVATE_MEM_SLOTS];
> >  };
> > 
> > +#define KVM_MSIX_MMIO_MAX    32
> 
> I assume this is based on 32 devices/bus, but we could exceed this if we
> add more buses or make use of more functions.  How hard is it to make
> this dynamic?

We can improve this later (maybe with a balanced tree?). 32 should be more than
sufficient for now. And this patchset has been on the mailing list for months; I
really want to get it in first...
 
> > +
> > +struct kvm_msix_mmio {
> > +	u32 dev_id;
> > +	u16 type;
> > +	u16 max_entries_nr;
> > +	u64 flags;
> > +	gpa_t table_base_addr;
> > +	hva_t table_base_va;
> > +	gpa_t pba_base_addr;
> > +	hva_t pba_base_va;
> > +};
> > +
> > +struct kvm_msix_mmio_dev {
> > +	struct kvm *kvm;
> > +	struct kvm_io_device table_dev;
> > +	int mmio_nr;
> > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > +	struct mutex lock;
> > +};
> > +
> >  struct kvm {
> >  	spinlock_t mmu_lock;
> >  	raw_spinlock_t requests_lock;
> > @@ -281,6 +302,7 @@ struct kvm {
> >  	long mmu_notifier_count;
> >  #endif
> >  	long tlbs_dirty;
> > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> >  };
> >  
> >  /* The guest did something we don't support. */
> > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> >  int kvm_request_irq_source_id(struct kvm *kvm);
> >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> >  
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +			int assigned_dev_id, int entry, bool mask);
> > +
> >  /* For vcpu->arch.iommu_flags */
> >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > 
> > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > index ae72ae6..d1598a6 100644
> > --- a/virt/kvm/assigned-dev.c
> > +++ b/virt/kvm/assigned-dev.c
> > @@ -18,6 +18,7 @@
> >  #include <linux/interrupt.h>
> >  #include <linux/slab.h>
> >  #include "irq.h"
> > +#include "msix_mmio.h"
> >  
> >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
> >  						      int assigned_dev_id)
> > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> >  }
> >  
> > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > +				struct kvm_assigned_dev_kernel *adev)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = adev->assigned_dev_id;
> > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +	kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> >  static void kvm_free_assigned_device(struct kvm *kvm,
> >  				     struct kvm_assigned_dev_kernel
> >  				     *assigned_dev)
> >  {
> >  	kvm_free_assigned_irq(kvm, assigned_dev);
> >  
> > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > +
> >  	__pci_reset_function(assigned_dev->dev);
> >  	pci_restore_state(assigned_dev->dev);
> >  
> > @@ -785,3 +799,33 @@ out:
> >  	return r;
> >  }
> > 
> > +/* The caller should hold kvm->lock */
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, bool mask)
> > +{
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> > +
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > +	for (i = 0; i < adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (mask)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> > +out:
> > +	return r;
> > +}
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index b1b6cbb..b7807c8 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> >  
> >  #include "coalesced_mmio.h"
> >  #include "async_pf.h"
> > +#include "msix_mmio.h"
> >  
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/kvm.h>
> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> >  	struct mm_struct *mm = kvm->mm;
> >  
> >  	kvm_arch_sync_events(kvm);
> > +	kvm_unregister_msix_mmio_dev(kvm);
> >  	spin_lock(&kvm_lock);
> >  	list_del(&kvm->vm_list);
> >  	spin_unlock(&kvm_lock);
> > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  #endif
> > +	case KVM_REGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> >  	default:
> >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >  		if (r == -ENOTTY)
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> >  		return r;
> >  	}
> >  #endif
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r < 0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> > +
> >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> >  	if (r < 0)
> >  		kvm_put_kvm(kvm);
> > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  		     int len, const void *val)
> >  {
> > -	int i;
> > +	int i, r = -EOPNOTSUPP;
> >  	struct kvm_io_bus *bus;
> >  
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > -	for (i = 0; i < bus->dev_count; i++)
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > +	for (i = 0; i < bus->dev_count; i++) {
> > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > +		if (r == -ENOTSYNC)
> > +			break;
> > +		else if (!r)
> >  			return 0;
> > -	return -EOPNOTSUPP;
> > +	}
> > +	return r;
> >  }
> >  
> >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > 
> > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > new file mode 100644
> > index 0000000..dde9b62
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.c
> > @@ -0,0 +1,286 @@
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <linux/kvm.h>
> > +
> > +#include "msix_mmio.h"
> > +#include "iodev.h"
> > +
> > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> > +				int entry, u32 flag)
> > +{
> > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > +				mmio->dev_id, entry, flag);
> > +	return -EFAULT;
> > +}
> > +
> > +/* Caller must hold dev->lock */
> > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > +				gpa_t addr, int len)
> > +{
> > +	gpa_t start, end;
> > +	int i, r = -EINVAL;
> > +
> > +	for (i = 0; i < dev->mmio_nr; i++) {
> > +		start = dev->mmio[i].table_base_addr;
> > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > +			dev->mmio[i].max_entries_nr;
> > +		if (addr >= start && addr + len <= end) {
> > +			r = i;
> > +			break;
> > +		}
> 
> Yet another potential user of the weight balanced tree ;)

Yeah, we can improve it later.
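
If we do, even keeping mmio[] sorted by table_base_addr and binary searching
it would get lookups to O(log n). An untested sketch, assuming registered
ranges are kept sorted and never overlap:

static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
				gpa_t addr, int len)
{
	int lo = 0, hi = dev->mmio_nr - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		struct kvm_msix_mmio *m = &dev->mmio[mid];
		gpa_t end = m->table_base_addr +
			    (gpa_t)PCI_MSIX_ENTRY_SIZE * m->max_entries_nr;

		if (addr >= end)
			lo = mid + 1;
		else if (addr + len <= m->table_base_addr)
			hi = mid - 1;
		else	/* overlaps this range; accept only if fully inside */
			return (addr >= m->table_base_addr &&
				addr + len <= end) ? mid : -EINVAL;
	}
	return -EINVAL;
}
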
> 
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > +				void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> > +
> > +	offset = addr & 0xf;
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	gpa_t entry_base;
> > +	u32 old_ctrl, new_ctrl;
> > +	u32 *ctrl_pos;
> > +
> > +	mutex_lock(&mmio_dev->kvm->lock);
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> > +
> > +	offset = addr & 0xF;
> 
> nit: this is 0xf above.

So you'd prefer it match the lowercase 0xf used above?
> 
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > +
> > +	if (get_user(old_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	/* Don't allow writing to other fields when the entry is unmasked */
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > +		goto out;
> > +
> > +	if (get_user(new_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > +		ret = -ENOTSYNC;
> 
> Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
> first?  I'd also prefer to see a direct goto out here since it's not
> entirely obvious that this first 'if' always falls into the below 'if'.
> I'm not a fan of using an error code as a special non-error return
> either.

In fact, when offset == PCI_MSIX_ENTRY_DATA and len == 8, we need to check
new_ctrl to see whether the guest indeed modified the ctrl bits.

I will try to make the logic here clearer.
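
For reference, the offset checks are based on the 16-byte MSI-X table entry
layout from the PCI spec (the PCI_MSIX_ENTRY_* defines that moved to
pci_regs.h):

/*
 * 0x0: PCI_MSIX_ENTRY_LOWER_ADDR   message address, low 32 bits
 * 0x4: PCI_MSIX_ENTRY_UPPER_ADDR   message address, high 32 bits
 * 0x8: PCI_MSIX_ENTRY_DATA         message data
 * 0xc: PCI_MSIX_ENTRY_VECTOR_CTRL  bit 0 is the per-vector mask bit
 *
 * So a len == 8 write at PCI_MSIX_ENTRY_DATA covers bytes 0x8-0xf and
 * therefore also the vector control word, which is why new_ctrl must be
 * re-read in that case.
 */
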
> 
> > +	if (old_ctrl == new_ctrl)
> > +		goto out;
> > +
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> 
> An xor would remove some redundancy here
> 
> if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl &
> PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm,
> mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));

Oh, yes, sure...
> 
> > +	if (r || ret)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	mutex_unlock(&mmio_dev->kvm->lock);
> > +	return ret;
> > +}
> > +
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > +	kvm->msix_mmio_dev.kvm = kvm;
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	/* All flags are currently reserved */
> > +	if (mmio_user->flags) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > +	}
> > +
> > +	mmio_dev->mmio_nr++;
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	int r = 0, i, j;
> 
> set r = -EINVAL here
> 
> > +	bool found = 0;
> > +
> > +	if (!mmio)
> > +		return 0;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > +		    mmio_dev->mmio[i].type == mmio->type) {
> > +			found = true;
> 
> set r = 0 here
> 
> > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > +			mmio_dev->mmio_nr--;
> > +			break;
> > +		}
> > +	}
> > +	if (!found)
> > +		r = -EINVAL;
> 
> then get rid of found

Sure.
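
Something like this, then (an untested sketch of the reworked cleanup; note
it clears the vacated slot at mmio_nr - 1, where the current code indexes
one slot too far):

int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
{
	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
	int r = -EINVAL, i, j;

	if (!mmio)
		return 0;

	mutex_lock(&mmio_dev->lock);
	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
		    mmio_dev->mmio[i].type == mmio->type) {
			r = 0;
			/* compact the array over the removed entry */
			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
			mmio_dev->mmio_nr--;
			memset(&mmio_dev->mmio[mmio_dev->mmio_nr], 0,
			       sizeof(mmio_dev->mmio[0]));
			break;
		}
	}
	mutex_unlock(&mmio_dev->lock);
	return r;
}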

--
regards
Yang, Sheng

> 
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = mmio_user->dev_id;
> > +	mmio.type = mmio_user->type;
> > +
> > +	return kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > new file mode 100644
> > index 0000000..01b6587
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.h
> > @@ -0,0 +1,25 @@
> > +#ifndef __KVM_MSIX_MMIO_H__
> > +#define __KVM_MSIX_MMIO_H__
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/pci.h>
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > *mmio_user); +
> > +#endif

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-30  5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
  2011-02-03  1:05   ` Marcelo Tosatti
@ 2011-02-23  0:19   ` Alex Williamson
  2011-02-23  6:59     ` Sheng Yang
  1 sibling, 1 reply; 45+ messages in thread
From: Alex Williamson @ 2011-02-23  0:19 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm

On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
> With this, we can now support mask bit operations for assigned devices.

Looks pretty good overall.  A few comments below.  It seems like we
should be able to hook this into vfio with a small stub in kvm.  We just
need to be able to communicate disabling and enabling of individual msix
vectors.  For brute force, we could do this with a lot of eventfds,
triggered by kvm and consumed by vfio, two per MSI-X vector.  Not sure
if there's something smaller that could do it.  Thanks,

Alex
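
To make the brute force idea concrete, a rough sketch of the pairing; the
names (msix_mask_evt_pair, notify_vector_masked) are hypothetical, this is
not an existing KVM or VFIO interface:

#include <linux/eventfd.h>

/* One pair per MSI-X vector: kvm signals, vfio consumes. */
struct msix_mask_evt_pair {
	struct eventfd_ctx *mask;	/* guest masked the vector */
	struct eventfd_ctx *unmask;	/* guest unmasked the vector */
};

static void notify_vector_masked(struct msix_mask_evt_pair *p, bool masked)
{
	eventfd_signal(masked ? p->mask : p->unmask, 1);
}

kvm would call notify_vector_masked() from its mask bit handler, and vfio
would disable or enable the host vector when the corresponding eventfd
fires.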

> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
>  arch/x86/kvm/Makefile    |    2 +-
>  arch/x86/kvm/x86.c       |    8 +-
>  include/linux/kvm.h      |   21 ++++
>  include/linux/kvm_host.h |   25 ++++
>  virt/kvm/assigned-dev.c  |   44 +++++++
>  virt/kvm/kvm_main.c      |   38 ++++++-
>  virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h     |   25 ++++
>  8 files changed, 442 insertions(+), 7 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h
> 
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..3a0d851 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
>  
>  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
>  				coalesced_mmio.o irq_comm.o eventfd.o \
> -				assigned-dev.o)
> +				assigned-dev.o msix_mmio.o)
>  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fa708c9..89bf12c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
>  	case KVM_CAP_XSAVE:
>  	case KVM_CAP_ASYNC_PF:
> +	case KVM_CAP_MSIX_MMIO:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  					   struct kvm_vcpu *vcpu)
>  {
>  	gpa_t                 gpa;
> +	int r;
>  
>  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
>  
> @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  
>  mmio:
>  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
>  	/*
>  	 * Is this MMIO handled locally?
>  	 */
> -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> +	if (!r)
>  		return X86EMUL_CONTINUE;
>  
>  	vcpu->mmio_needed = 1;
> -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
>  	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
>  	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
>  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..ad9df4b 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
>  #define KVM_EXIT_NMI              16
>  #define KVM_EXIT_INTERNAL_ERROR   17
>  #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_GET_PVINFO 57
>  #define KVM_CAP_PPC_IRQ_LEVEL 58
>  #define KVM_CAP_ASYNC_PF 59
> +#define KVM_CAP_MSIX_MMIO 60
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -672,6 +674,9 @@ struct kvm_clock_data {
>  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
>  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
>  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> +/* Available with KVM_CAP_MSIX_MMIO */
> +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
>  /* Available with KVM_CAP_PIT_STATE2 */
>  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
>  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
>  	__u16 padding[3];
>  };
>  
> +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> +
> +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> +
> +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> +struct kvm_msix_mmio_user {
> +	__u32 dev_id;
> +	__u16 type;
> +	__u16 max_entries_nr;
> +	__u64 base_addr;
> +	__u64 base_va;
> +	__u64 flags;
> +	__u64 reserved[4];
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7d313e0..c10670c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -233,6 +233,27 @@ struct kvm_memslots {
>  					KVM_PRIVATE_MEM_SLOTS];
>  };
>  
> +#define KVM_MSIX_MMIO_MAX    32

I assume this is based on 32 devices/bus, but we could exceed this if we
add more buses or make use of more functions.  How hard is it to make
this dynamic?

> +
> +struct kvm_msix_mmio {
> +	u32 dev_id;
> +	u16 type;
> +	u16 max_entries_nr;
> +	u64 flags;
> +	gpa_t table_base_addr;
> +	hva_t table_base_va;
> +	gpa_t pba_base_addr;
> +	hva_t pba_base_va;
> +};
> +
> +struct kvm_msix_mmio_dev {
> +	struct kvm *kvm;
> +	struct kvm_io_device table_dev;
> +	int mmio_nr;
> +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> +	struct mutex lock;
> +};
> +
>  struct kvm {
>  	spinlock_t mmu_lock;
>  	raw_spinlock_t requests_lock;
> @@ -281,6 +302,7 @@ struct kvm {
>  	long mmu_notifier_count;
>  #endif
>  	long tlbs_dirty;
> +	struct kvm_msix_mmio_dev msix_mmio_dev;
>  };
>  
>  /* The guest did something we don't support. */
> @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
>  int kvm_request_irq_source_id(struct kvm *kvm);
>  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
>  
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +			int assigned_dev_id, int entry, bool mask);
> +
>  /* For vcpu->arch.iommu_flags */
>  #define KVM_IOMMU_CACHE_COHERENCY	0x1
>  
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ae72ae6..d1598a6 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -18,6 +18,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
>  #include "irq.h"
> +#include "msix_mmio.h"
>  
>  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
>  						      int assigned_dev_id)
> @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
>  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
>  }
>  
> +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_dev_kernel *adev)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = adev->assigned_dev_id;
> +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +	kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
>  static void kvm_free_assigned_device(struct kvm *kvm,
>  				     struct kvm_assigned_dev_kernel
>  				     *assigned_dev)
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> +
>  	__pci_reset_function(assigned_dev->dev);
>  	pci_restore_state(assigned_dev->dev);
>  
> @@ -785,3 +799,33 @@ out:
>  	return r;
>  }
>  
> +/* The caller should hold kvm->lock */
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	/* For non-MSIX enabled devices, entries_nr == 0 */
> +	for (i = 0; i < adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}
> +out:
> +	return r;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b1b6cbb..b7807c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -56,6 +56,7 @@
>  
>  #include "coalesced_mmio.h"
>  #include "async_pf.h"
> +#include "msix_mmio.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	struct mm_struct *mm = kvm->mm;
>  
>  	kvm_arch_sync_events(kvm);
> +	kvm_unregister_msix_mmio_dev(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->lock);
>  		break;
>  #endif
> +	case KVM_REGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
> +	case KVM_UNREGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>  		return r;
>  	}
>  #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r < 0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}
> +
>  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>  	if (r < 0)
>  		kvm_put_kvm(kvm);
> @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  		     int len, const void *val)
>  {
> -	int i;
> +	int i, r = -EOPNOTSUPP;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> -	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> +	for (i = 0; i < bus->dev_count; i++) {
> +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> +		if (r == -ENOTSYNC)
> +			break;
> +		else if (!r)
>  			return 0;
> -	return -EOPNOTSUPP;
> +	}
> +	return r;
>  }
>  
>  /* kvm_io_bus_read - called under kvm->slots_lock */
> diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> new file mode 100644
> index 0000000..dde9b62
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.c
> @@ -0,0 +1,286 @@
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +
> +#include "msix_mmio.h"
> +#include "iodev.h"
> +
> +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> +				int entry, u32 flag)
> +{
> +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> +				mmio->dev_id, entry, flag);
> +	return -EFAULT;
> +}
> +
> +/* Caller must hold dev->lock */
> +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> +				gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +	int i, r = -EINVAL;
> +
> +	for (i = 0; i < dev->mmio_nr; i++) {
> +		start = dev->mmio[i].table_base_addr;
> +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> +			dev->mmio[i].max_entries_nr;
> +		if (addr >= start && addr + len <= end) {
> +			r = i;
> +			break;
> +		}

Yet another potential user of the weight balanced tree ;)

> +	}
> +
> +	return r;
> +}
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xf;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	u32 *ctrl_pos;
> +
> +	mutex_lock(&mmio_dev->kvm->lock);
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xF;

nit: this is 0xf above.

> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* Don't allow writing to other fields when the entry is unmasked */
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;
> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;
> +
> +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> +		ret = -ENOTSYNC;

Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
first?  I'd also prefer to see a direct goto out here since it's not
entirely obvious that this first 'if' always falls into the below 'if'.
I'm not a fan of using an error code as a special non-error return
either.
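
One possible shape for that (untested sketch; an 8-byte write at
PCI_MSIX_ENTRY_DATA still reaches the control word, so that case keeps the
re-read):

	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
		ret = -ENOTSYNC;

	/* the write cannot have touched the ctrl word, nothing to sync */
	if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out;

	if (get_user(new_ctrl, ctrl_pos))
		goto out;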

> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);

An xor would remove some redundancy here

if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
	r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));

> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	mutex_unlock(&mmio_dev->kvm->lock);
> +	return ret;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> +	mutex_init(&kvm->msix_mmio_dev.lock);
> +	kvm->msix_mmio_dev.kvm = kvm;
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}
> +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	/* All flags are currently reserved */
> +	if (mmio_user->flags) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> +	}
> +
> +	mmio_dev->mmio_nr++;
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	int r = 0, i, j;

set r = -EINVAL here

> +	bool found = 0;
> +
> +	if (!mmio)
> +		return 0;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> +		    mmio_dev->mmio[i].type == mmio->type) {
> +			found = true;

set r = 0 here

> +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> +			mmio_dev->mmio_nr--;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		r = -EINVAL;

then get rid of found

> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = mmio_user->dev_id;
> +	mmio.type = mmio_user->type;
> +
> +	return kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
> diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> new file mode 100644
> index 0000000..01b6587
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.h
> @@ -0,0 +1,25 @@
> +#ifndef __KVM_MSIX_MMIO_H__
> +#define __KVM_MSIX_MMIO_H__
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/pci.h>
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user);
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user);
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> +
> +#endif




^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-18  8:15     ` Sheng Yang
@ 2011-02-22 17:58       ` Marcelo Tosatti
  0 siblings, 0 replies; 45+ messages in thread
From: Marcelo Tosatti @ 2011-02-22 17:58 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson

On Fri, Feb 18, 2011 at 04:15:38PM +0800, Sheng Yang wrote:
> > Why allow this ioctl to succeed if there's an entry already present?
> > This case is broken as mmio_dev->mmio_nr is increased below.
> 
> Oh, it's a bug to let mmio_nr increase even when the MMIO entry is found. I've fixed it.
> 
> The reason we allow multiple calls is that userspace can register different
> types of addresses here (table address and PBA address).

Ah OK. 

> > PCI bits must be reviewed...
> 
> Pardon? The PCI-related bits are already in 2.6.38-rc.

I meant someone more familiar with PCI should review this patch,
Michael/Alex, CC'ed.


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-02-03  1:05   ` Marcelo Tosatti
@ 2011-02-18  8:15     ` Sheng Yang
  2011-02-22 17:58       ` Marcelo Tosatti
  0 siblings, 1 reply; 45+ messages in thread
From: Sheng Yang @ 2011-02-18  8:15 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson

On Thursday 03 February 2011 09:05:55 Marcelo Tosatti wrote:
> On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote:
> > With this, we can now support mask bit operations for assigned devices.
> > 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > 
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> 
> Why allow this ioctl to succeed if there's an entry already present?
> This case is broken as mmio_dev->mmio_nr is increased below.

Oh, it's a bug to let mmio_nr increase even when the MMIO entry is found. I've fixed it.
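
Presumably along the lines of bumping mmio_nr only when a new slot is
actually taken (sketch):

	if (!mmio) {
		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
			r = -ENOSPC;
			goto out;
		}
		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr++];
	}
	/* no unconditional mmio_dev->mmio_nr++ after this point */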

The reason we allow multiple calls is that userspace can register different
types of addresses here (table address and PBA address).
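
For illustration, the userspace side then makes one call per region; vm_fd,
dev_id, nr_entries, table_gpa and table_hva below are placeholders, and a
PBA registration would simply pass a different base type bit once one is
defined:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int register_msix_table(int vm_fd, __u32 dev_id, __u16 nr_entries,
			       __u64 table_gpa, void *table_hva)
{
	struct kvm_msix_mmio_user mmio = {
		.dev_id		= dev_id,
		.type		= KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
				  KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr	= nr_entries,
		.base_addr	= table_gpa,
		.base_va	= (__u64)(unsigned long)table_hva,
	};

	return ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio);
}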

> PCI bits must be reviewed...

Pardon? The PCI-related bits are already in 2.6.38-rc.

--
regards
Yang, Sheng



^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-30  5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-02-03  1:05   ` Marcelo Tosatti
  2011-02-18  8:15     ` Sheng Yang
  2011-02-23  0:19   ` Alex Williamson
  1 sibling, 1 reply; 45+ messages in thread
From: Marcelo Tosatti @ 2011-02-03  1:05 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson

On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote:
> With this, we can now support mask bit operations for assigned devices.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>

> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}

Why allow this ioctl to succeed if there's an entry already present?
This case is broken as mmio_dev->mmio_nr is increased below.

PCI bits must be reviewed...


^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-30  5:11 [PATCH 0/3 v8] " Sheng Yang
@ 2011-01-30  5:11 ` Sheng Yang
  2011-02-03  1:05   ` Marcelo Tosatti
  2011-02-23  0:19   ` Alex Williamson
  0 siblings, 2 replies; 45+ messages in thread
From: Sheng Yang @ 2011-01-30  5:11 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

With this, we can now support mask bit operations for assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/kvm/Makefile    |    2 +-
 arch/x86/kvm/x86.c       |    8 +-
 include/linux/kvm.h      |   21 ++++
 include/linux/kvm_host.h |   25 ++++
 virt/kvm/assigned-dev.c  |   44 +++++++
 virt/kvm/kvm_main.c      |   38 ++++++-
 virt/kvm/msix_mmio.c     |  286 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h     |   25 ++++
 8 files changed, 442 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
+		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
 	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
 	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..c10670c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -233,6 +233,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -281,6 +302,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..d1598a6 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+/* The caller should hold kvm->lock */
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	/* For non-MSIX enabled devices, entries_nr == 0 */
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1b6cbb..b7807c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..dde9b62
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,286 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;
+}
+
+/* Caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;
+		if (addr >= start && addr + len <= end) {
+			r = i;
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset, r;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	/* Zero-fill first so dropped accesses don't leak stack data */
+	memset(val, 0, len);
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr & 0xf;
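+	/* An 8 byte read at the control word would cross into the next entry */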
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	if (copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len))
+		memset(val, 0, len);
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	gpa_t entry_base;
+	u32 old_ctrl, new_ctrl;
+	u32 *ctrl_pos;
+
+	mutex_lock(&mmio_dev->kvm->lock);
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr & 0xf;
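+	/* An 8 byte write at the control word would cross into the next entry */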
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* Don't allow writes to other fields while the entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
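+	/*
+	 * Writes touching the address or data fields still need to be
+	 * seen by userspace, hence the -ENOTSYNC below.
+	 */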
+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
+		ret = -ENOTSYNC;
+	if (old_ctrl == new_ctrl)
+		goto out;
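+	/* Propagate mask bit changes to the assigned device */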
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
+	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
+	if (r || ret)
+		ret = -ENOTSYNC;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	mutex_unlock(&mmio_dev->kvm->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
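+	/* Reuse an existing registration for the same device, if any */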
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All flags are currently reserved */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		/* Only a genuinely new registration consumes a slot */
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr++];
+	}
+
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = 0, i, j;
+	bool found = false;
+
+	if (!mmio)
+		return 0;
+
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			found = true;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			/* Clear the slot that just became unused */
+			mmio_dev->mmio_nr--;
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			break;
+		}
+	}
+	if (!found)
+		r = -EINVAL;
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,25 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio);
+
+#endif
-- 
1.7.0.1
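
For readers wanting to try the new interface, a minimal userspace sketch
follows. This is an illustration only: the ioctl, struct and constant
names come from this patch's include/linux/kvm.h hunk, while vm_fd,
dev_id, nr_entries, table_gpa and table_hva are placeholders the caller
must supply (cast to the field types as needed).

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	struct kvm_msix_mmio_user mmio_user = {
		.dev_id         = dev_id,      /* assigned device id */
		.type           = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
				  KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr = nr_entries,  /* at most KVM_MAX_MSIX_PER_DEV */
		.base_addr      = table_gpa,   /* guest physical base of the table */
		.base_va        = table_hva,   /* host virtual mapping of the table */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio_user) < 0)
		perror("KVM_REGISTER_MSIX_MMIO");

	/* ... run the guest; mask/unmask table writes are now handled in kernel ... */

	ioctl(vm_fd, KVM_UNREGISTER_MSIX_MMIO, &mmio_user);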

