All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3 v7] MSI-X MMIO support for KVM
@ 2011-01-06 10:19 Sheng Yang
  2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

Change from v6:
1. Discard PBA support. But we can still add it later.
2. Fix one memory reference bug
3. Add automatic MMIO unregistration after the device is deassigned.
4. Update according to Avi's comments.
5. Add documents for new API.

Notice this patchset depends on two PCI patches named:

PCI: MSI: Move MSI-X entry definition to pci_regs.h
PCI: Add mask bit definition for MSI-X table

These two patches are in Jesse's pci-2.6 tree. Do I need to repost them?

Sheng Yang (3):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Emulate MSI-X table in kernel
  KVM: Add documents for MSI-X MMIO API

 Documentation/kvm/api.txt |   41 +++++++
 arch/x86/kvm/Makefile     |    2 +-
 arch/x86/kvm/x86.c        |    8 +-
 include/linux/kvm.h       |   21 ++++
 include/linux/kvm_host.h  |   48 ++++++++
 virt/kvm/assigned-dev.c   |   44 +++++++
 virt/kvm/iodev.h          |   25 +----
 virt/kvm/kvm_main.c       |   38 ++++++-
 virt/kvm/msix_mmio.c      |  284 +++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h      |   25 ++++
 10 files changed, 505 insertions(+), 31 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
@ 2011-01-06 10:19 ` Sheng Yang
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

Then it can be used by other structs in kvm_host.h.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
@ 2011-01-06 10:19 ` Sheng Yang
  2011-01-17 11:54   ` Marcelo Tosatti
  2011-01-17 12:29   ` Avi Kivity
  2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
  2011-01-12  2:23 ` [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  3 siblings, 2 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

With this, we can now support the mask bit operation for assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/kvm/Makefile    |    2 +-
 arch/x86/kvm/x86.c       |    8 +-
 include/linux/kvm.h      |   21 ++++
 include/linux/kvm_host.h |   25 ++++
 virt/kvm/assigned-dev.c  |   44 +++++++
 virt/kvm/kvm_main.c      |   38 ++++++-
 virt/kvm/msix_mmio.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h     |   25 ++++
 8 files changed, 440 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
+		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
 	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
 	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..c10670c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -233,6 +233,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -281,6 +302,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..7444dcd 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	mutex_lock(&kvm->lock);
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1b6cbb..b7807c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..dc6e8ca
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,284 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;
+}
+
+/* Caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;
+		if (addr >= start && addr + len <= end) {
+			r = i;
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset, r;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr & 0xf;
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+	if (r)
+		goto out;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	gpa_t entry_base;
+	u32 old_ctrl, new_ctrl;
+	u32 *ctrl_pos;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr & 0xF;
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* No allow writing to other fields when entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
+		ret = -ENOTSYNC;
+	if (old_ctrl == new_ctrl)
+		goto out;
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
+	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
+	if (r || ret)
+		ret = -ENOTSYNC;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All reserved currently */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+	}
+
+	mmio_dev->mmio_nr++;
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = 0, i, j;
+	bool found = 0;
+
+	if (!mmio)
+		return 0;
+
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			found = true;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			mmio_dev->mmio_nr--;
+			break;
+		}
+	}
+	if (!found)
+		r = -EINVAL;
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,25 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
+
+#endif
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-01-06 10:19 ` Sheng Yang
  2011-01-17 12:21   ` Avi Kivity
  2011-01-12  2:23 ` [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
  3 siblings, 1 reply; 29+ messages in thread
From: Sheng Yang @ 2011-01-06 10:19 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang


Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 Documentation/kvm/api.txt |   41 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..4978b94 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API indicates an MSI-X MMIO address of a guest device. Then all MMIO
+operation would be handled by kernel. When necessary(e.g. MSI data/address
+changed), KVM would exit to userspace using KVM_EXIT_MSIX_ROUTING_UPDATE to
+indicate the MMIO modification and require userspace to update IRQ routing
+table.
+
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;		/* Device type and MMIO address type */
+	__u16 max_entries_nr;	/* Maximum entries supported */
+	__u64 base_addr;	/* Guest physical address of MMIO */
+	__u64 base_va;		/* Host virtual address of MMIO mapping */
+	__u64 flags;		/* Reserved for now */
+	__u64 reserved[4];
+};
+
+Current device type can be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+Current MMIO type can be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API would unregister the specific MSI-X MMIO, indicated by dev_id and
+type fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 0/3 v7] MSI-X MMIO support for KVM
  2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
                   ` (2 preceding siblings ...)
  2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
@ 2011-01-12  2:23 ` Sheng Yang
  3 siblings, 0 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-12  2:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Thursday 06 January 2011 18:19:42 Sheng Yang wrote:
> Change from v6:
> 1. Discard PBA support. But we can still add it later.
> 2. Fix one memory reference bug
> 3. Add automatically MMIO unregister after device was deassigned.
> 4. Update according to Avi's comments.
> 5. Add documents for new API.

Avi?

--
regards
Yang, Sheng

> 
> Notice this patchset depends on two PCI patches named:
> 
> PCI: MSI: Move MSI-X entry definition to pci_regs.h
> PCI: Add mask bit definition for MSI-X table
> 
> These two patches are in the Jesse's pci-2.6 tree. Do I need to repost
> them?
> 
> Sheng Yang (3):
>   KVM: Move struct kvm_io_device to kvm_host.h
>   KVM: Emulate MSI-X table in kernel
>   KVM: Add documents for MSI-X MMIO API
> 
>  Documentation/kvm/api.txt |   41 +++++++
>  arch/x86/kvm/Makefile     |    2 +-
>  arch/x86/kvm/x86.c        |    8 +-
>  include/linux/kvm.h       |   21 ++++
>  include/linux/kvm_host.h  |   48 ++++++++
>  virt/kvm/assigned-dev.c   |   44 +++++++
>  virt/kvm/iodev.h          |   25 +----
>  virt/kvm/kvm_main.c       |   38 ++++++-
>  virt/kvm/msix_mmio.c      |  284
> +++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h      | 
>  25 ++++
>  10 files changed, 505 insertions(+), 31 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-01-17 11:54   ` Marcelo Tosatti
  2011-01-17 12:18     ` Sheng Yang
  2011-01-17 12:29   ` Avi Kivity
  1 sibling, 1 reply; 29+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 11:54 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
> Then we can support mask bit operation of assigned devices now.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
>  arch/x86/kvm/Makefile    |    2 +-
>  arch/x86/kvm/x86.c       |    8 +-
>  include/linux/kvm.h      |   21 ++++
>  include/linux/kvm_host.h |   25 ++++
>  virt/kvm/assigned-dev.c  |   44 +++++++
>  virt/kvm/kvm_main.c      |   38 ++++++-
>  virt/kvm/msix_mmio.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h     |   25 ++++
>  8 files changed, 440 insertions(+), 7 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h
> 
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ae72ae6..7444dcd 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -18,6 +18,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
>  #include "irq.h"
> +#include "msix_mmio.h"
>  
>  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
>  						      int assigned_dev_id)
> @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
>  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
>  }
>  
> +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_dev_kernel *adev)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = adev->assigned_dev_id;
> +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +	kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
>  static void kvm_free_assigned_device(struct kvm *kvm,
>  				     struct kvm_assigned_dev_kernel
>  				     *assigned_dev)
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> +
>  	__pci_reset_function(assigned_dev->dev);
>  	pci_restore_state(assigned_dev->dev);
>  
> @@ -785,3 +799,33 @@ out:
>  	return r;
>  }
>  
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	for (i = 0; i < adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}

Should check if the host irq is registered as MSIX type.

> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b1b6cbb..b7807c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -56,6 +56,7 @@
>  
>  #include "coalesced_mmio.h"
>  #include "async_pf.h"
> +#include "msix_mmio.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	struct mm_struct *mm = kvm->mm;
>  
>  	kvm_arch_sync_events(kvm);
> +	kvm_unregister_msix_mmio_dev(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->lock);
>  		break;
>  #endif
> +	case KVM_REGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
> +	case KVM_UNREGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>  		return r;
>  	}
>  #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r < 0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}
> +
>  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>  	if (r < 0)
>  		kvm_put_kvm(kvm);
> @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  		     int len, const void *val)
>  {
> -	int i;
> +	int i, r = -EOPNOTSUPP;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> -	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> +	for (i = 0; i < bus->dev_count; i++) {
> +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> +		if (r == -ENOTSYNC)
> +			break;
> +		else if (!r)
>  			return 0;
> -	return -EOPNOTSUPP;
> +	}
> +	return r;
>  }
>  
>  /* kvm_io_bus_read - called under kvm->slots_lock */
> diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> new file mode 100644
> index 0000000..dc6e8ca
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.c
> @@ -0,0 +1,284 @@
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +
> +#include "msix_mmio.h"
> +#include "iodev.h"
> +
> +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> +				int entry, u32 flag)
> +{
> +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> +				mmio->dev_id, entry, flag);
> +	return -EFAULT;
> +}
> +
> +/* Caller must hold dev->lock */
> +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> +				gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +	int i, r = -EINVAL;
> +
> +	for (i = 0; i < dev->mmio_nr; i++) {
> +		start = dev->mmio[i].table_base_addr;
> +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> +			dev->mmio[i].max_entries_nr;
> +		if (addr >= start && addr + len <= end) {
> +			r = i;
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xf;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	u32 *ctrl_pos;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xF;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* No allow writing to other fields when entry is unmasked */
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;

Instead of copying to/from userspace (which is subject to swapin,
unexpected values), you could include the guest written value in a
kvm_run structure, along with address. Qemu-kvm would use that to
synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.

> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;
> +
> +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> +		ret = -ENOTSYNC;
> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);

Then you rely on the kernel copy of the values to enable/disable irq.

> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> +	mutex_init(&kvm->msix_mmio_dev.lock);
> +	kvm->msix_mmio_dev.kvm = kvm;
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}
> +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	/* All reserved currently */
> +	if (mmio_user->flags) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> +	}
> +
> +	mmio_dev->mmio_nr++;
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	int r = 0, i, j;
> +	bool found = 0;
> +
> +	if (!mmio)
> +		return 0;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);

Its valid for mmio_nr to be equal to KVM_MSIX_MMIO_MAX.

> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> +		    mmio_dev->mmio[i].type == mmio->type) {
> +			found = true;
> +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> +			mmio_dev->mmio_nr--;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		r = -EINVAL;
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}

This is not consistent with registration, where there are no checks
regarding validity of assigned device id. So why is it necessary?

There is a lock ordering problem BTW: 

- read/write handlers: dev->lock -> kvm->lock
- vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock

> +
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = mmio_user->dev_id;
> +	mmio.type = mmio_user->type;
> +
> +	return kvm_free_msix_mmio(kvm, &mmio);
> +}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 11:54   ` Marcelo Tosatti
@ 2011-01-17 12:18     ` Sheng Yang
  2011-01-17 12:18       ` Avi Kivity
  2011-01-17 12:39       ` Marcelo Tosatti
  0 siblings, 2 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-17 12:18 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Monday 17 January 2011 19:54:47 Marcelo Tosatti wrote:
> On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
> > Then we can support mask bit operation of assigned devices now.
> > 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > ---
> > 
> >  arch/x86/kvm/Makefile    |    2 +-
> >  arch/x86/kvm/x86.c       |    8 +-
> >  include/linux/kvm.h      |   21 ++++
> >  include/linux/kvm_host.h |   25 ++++
> >  virt/kvm/assigned-dev.c  |   44 +++++++
> >  virt/kvm/kvm_main.c      |   38 ++++++-
> >  virt/kvm/msix_mmio.c     |  284
> >  ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h    
> >  |   25 ++++
> >  8 files changed, 440 insertions(+), 7 deletions(-)
> >  create mode 100644 virt/kvm/msix_mmio.c
> >  create mode 100644 virt/kvm/msix_mmio.h
> > 
> > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > index ae72ae6..7444dcd 100644
> > --- a/virt/kvm/assigned-dev.c
> > +++ b/virt/kvm/assigned-dev.c
> > @@ -18,6 +18,7 @@
> > 
> >  #include <linux/interrupt.h>
> >  #include <linux/slab.h>
> >  #include "irq.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> >  list_head *head,
> >  
> >  						      int assigned_dev_id)
> > 
> > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > 
> >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> >  
> >  }
> > 
> > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > +				struct kvm_assigned_dev_kernel *adev)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = adev->assigned_dev_id;
> > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +	kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > 
> >  static void kvm_free_assigned_device(struct kvm *kvm,
> >  
> >  				     struct kvm_assigned_dev_kernel
> >  				     *assigned_dev)
> >  
> >  {
> >  
> >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > 
> > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > +
> > 
> >  	__pci_reset_function(assigned_dev->dev);
> >  	pci_restore_state(assigned_dev->dev);
> > 
> > @@ -785,3 +799,33 @@ out:
> >  	return r;
> >  
> >  }
> > 
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, bool mask)
> > +{
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> > +
> > +	mutex_lock(&kvm->lock);
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	for (i = 0; i < adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (mask)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> 
> Should check if the host irq is registered as MSIX type.
> 
> > +out:
> > +	mutex_unlock(&kvm->lock);
> > +	return r;
> > +}
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index b1b6cbb..b7807c8 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> > 
> >  #include "coalesced_mmio.h"
> >  #include "async_pf.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/kvm.h>
> > 
> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > 
> >  	struct mm_struct *mm = kvm->mm;
> >  	
> >  	kvm_arch_sync_events(kvm);
> > 
> > +	kvm_unregister_msix_mmio_dev(kvm);
> > 
> >  	spin_lock(&kvm_lock);
> >  	list_del(&kvm->vm_list);
> >  	spin_unlock(&kvm_lock);
> > 
> > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > 
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  
> >  #endif
> > 
> > +	case KVM_REGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > 
> >  	default:
> >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >  		if (r == -ENOTTY)
> > 
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > 
> >  		return r;
> >  	
> >  	}
> >  
> >  #endif
> > 
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r < 0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> > +
> > 
> >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> >  	if (r < 0)
> >  	
> >  		kvm_put_kvm(kvm);
> > 
> > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > *bus)
> > 
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  
> >  		     int len, const void *val)
> >  
> >  {
> > 
> > -	int i;
> > +	int i, r = -EOPNOTSUPP;
> > 
> >  	struct kvm_io_bus *bus;
> >  	
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > 
> > -	for (i = 0; i < bus->dev_count; i++)
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > +	for (i = 0; i < bus->dev_count; i++) {
> > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> > +		if (r == -ENOTSYNC)
> > +			break;
> > +		else if (!r)
> > 
> >  			return 0;
> > 
> > -	return -EOPNOTSUPP;
> > +	}
> > +	return r;
> > 
> >  }
> >  
> >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > 
> > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > new file mode 100644
> > index 0000000..dc6e8ca
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.c
> > @@ -0,0 +1,284 @@
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <linux/kvm.h>
> > +
> > +#include "msix_mmio.h"
> > +#include "iodev.h"
> > +
> > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio
> > *mmio, +				int entry, u32 flag)
> > +{
> > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > +				mmio->dev_id, entry, flag);
> > +	return -EFAULT;
> > +}
> > +
> > +/* Caller must hold dev->lock */
> > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > +				gpa_t addr, int len)
> > +{
> > +	gpa_t start, end;
> > +	int i, r = -EINVAL;
> > +
> > +	for (i = 0; i < dev->mmio_nr; i++) {
> > +		start = dev->mmio[i].table_base_addr;
> > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > +			dev->mmio[i].max_entries_nr;
> > +		if (addr >= start && addr + len <= end) {
> > +			r = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > int len, +				void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> > +
> > +	offset = addr & 0xf;
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	gpa_t entry_base;
> > +	u32 old_ctrl, new_ctrl;
> > +	u32 *ctrl_pos;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> > +
> > +	offset = addr & 0xF;
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > +
> > +	if (get_user(old_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	/* No allow writing to other fields when entry is unmasked */
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > +		goto out;
> 
> Instead of copying to/from userspace (which is subject to swapin,
> unexpected values), you could include the guest written value in a
> kvm_run structure, along with address. Qemu-kvm would use that to
> synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.

We want to accelerate MSI-X mask bit accesses, which won't exit to userspace in
most cases. That's the cost we want to optimize. Also it's possible for
userspace to read the correct value of the MMIO region (but mostly userspace can't write to it,
in order to prevent synchronization issues).
> 
> > +
> > +	if (get_user(new_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > +		ret = -ENOTSYNC;
> > +	if (old_ctrl == new_ctrl)
> > +		goto out;
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> 
> Then you rely on the kernel copy of the values to enable/disable irq.

Yes, they are guest owned assigned IRQs. Any potential issue?
> 
> > +	if (r || ret)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > +	kvm->msix_mmio_dev.kvm = kvm;
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	/* All reserved currently */
> > +	if (mmio_user->flags) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > +	}
> > +
> > +	mmio_dev->mmio_nr++;
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	int r = 0, i, j;
> > +	bool found = 0;
> > +
> > +	if (!mmio)
> > +		return 0;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);
> 
> Its valid for mmio_nr to be equal to KVM_MSIX_MMIO_MAX.
> 
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > +		    mmio_dev->mmio[i].type == mmio->type) {
> > +			found = true;
> > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > +			mmio_dev->mmio_nr--;
> > +			break;
> > +		}
> > +	}
> > +	if (!found)
> > +		r = -EINVAL;
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> 
> This is not consistent with registration, where there are no checks
> regarding validity of assigned device id. So why is it necessary?

I don't quite understand. We need to free the mmio anyway, otherwise it would result
in wrong mmio interception...

> 
> There is a lock ordering problem BTW:
> 
> - read/write handlers: dev->lock -> kvm->lock
> - vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock

Good catch! Will fix it (and address the other comments, of course).

--
regards
Yang, Sheng

> 
> > +
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = mmio_user->dev_id;
> > +	mmio.type = mmio_user->type;
> > +
> > +	return kvm_free_msix_mmio(kvm, &mmio);
> > +}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:18     ` Sheng Yang
@ 2011-01-17 12:18       ` Avi Kivity
  2011-01-17 12:48         ` Marcelo Tosatti
  2011-01-17 12:39       ` Marcelo Tosatti
  1 sibling, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-17 12:18 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >  >  +
> >  >  +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >  >  +		goto out;
> >
> >  Instead of copying to/from userspace (which is subject to swapin,
> >  unexpected values), you could include the guest written value in a
> >  kvm_run structure, along with address. Qemu-kvm would use that to
> >  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
>
> We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in
> the most condition. That's the cost we want to optimize. Also it's possible to
> userspace to read the correct value of MMIO(but mostly userspace can't write to it
> in order to prevent synchronize issue).

It's also good to have the values in just one place; using userspace 
makes it easy for both the kernel and userspace to see the values (and 
set them after migration, if/when we extend this to virtio).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
@ 2011-01-17 12:21   ` Avi Kivity
  2011-01-17 12:35     ` Sheng Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-17 12:21 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/06/2011 12:19 PM, Sheng Yang wrote:
> Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> ---
>   Documentation/kvm/api.txt |   41 +++++++++++++++++++++++++++++++++++++++++
>   1 files changed, 41 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> index e1a9297..4978b94 100644
> --- a/Documentation/kvm/api.txt
> +++ b/Documentation/kvm/api.txt
> @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
>   	__u16 padding[3];
>   };
>
> +4.54 KVM_REGISTER_MSIX_MMIO
> +
> +Capability: KVM_CAP_MSIX_MMIO
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_msix_mmio_user (in)
> +Returns: 0 on success, -1 on error
> +
> +This API indicates an MSI-X MMIO address of a guest device. Then all MMIO
> +operation would be handled by kernel. When necessary(e.g. MSI data/address
> +changed), KVM would exit to userspace using KVM_EXIT_MSIX_ROUTING_UPDATE to
> +indicate the MMIO modification and require userspace to update IRQ routing
> +table.
> +
> +struct kvm_msix_mmio_user {
> +	__u32 dev_id;
> +	__u16 type;		/* Device type and MMIO address type */
> +	__u16 max_entries_nr;	/* Maximum entries supported */
> +	__u64 base_addr;	/* Guest physical address of MMIO */
> +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> +	__u64 flags;		/* Reserved for now */
> +	__u64 reserved[4];
> +};
> +
> +Current device type can be:
> +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> +
> +Current MMIO type can be:
> +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> +

How does userspace know which entry of which table changed?  Need a 
field in struct kvm_run for that.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
  2011-01-17 11:54   ` Marcelo Tosatti
@ 2011-01-17 12:29   ` Avi Kivity
  2011-01-17 13:31     ` Michael S. Tsirkin
                       ` (2 more replies)
  1 sibling, 3 replies; 29+ messages in thread
From: Avi Kivity @ 2011-01-17 12:29 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/06/2011 12:19 PM, Sheng Yang wrote:
> Then we can support mask bit operation of assigned devices now.
>
>
>
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	for (i = 0; i<  adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);

Is it okay to call disable_irq_nosync() here?  IIRC we don't check the 
mask bit on irq delivery, so we may forward an interrupt to the guest 
after the mask bit was set.

What does pci say about the mask bit?  when does it take effect?

Another question is whether disable_irq_nosync() actually programs the 
device mask bit, or not.  If it does, then it's slow, and it may be 
better to leave interrupts enabled but have an internal pending bit.  If 
it doesn't program the mask bit, it's fine.

> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}
> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
>
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx<  0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr&  0x3) || (len != 4&&  len != 8))
> +		goto out;
> +
> +	offset = addr&  0xf;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> +		goto out;
> +
> +	mmio =&mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;

and return ret == 0?

> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	u32 *ctrl_pos;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx<  0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr&  0x3) || (len != 4&&  len != 8))
> +		goto out;
> +
> +	offset = addr&  0xF;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> +		goto out;
> +
> +	mmio =&mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* No allow writing to other fields when entry is unmasked */
> +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;
> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;

here, too.

> +
> +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> +		ret = -ENOTSYNC;

goto out?

> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-17 12:21   ` Avi Kivity
@ 2011-01-17 12:35     ` Sheng Yang
  2011-01-17 12:45       ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Sheng Yang @ 2011-01-17 12:35 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Monday 17 January 2011 20:21:45 Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
> > Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> > ---
> > 
> >   Documentation/kvm/api.txt |   41
> >   +++++++++++++++++++++++++++++++++++++++++ 1 files changed, 41
> >   insertions(+), 0 deletions(-)
> > 
> > diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> > index e1a9297..4978b94 100644
> > --- a/Documentation/kvm/api.txt
> > +++ b/Documentation/kvm/api.txt
> > @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
> > 
> >   	__u16 padding[3];
> >   
> >   };
> > 
> > +4.54 KVM_REGISTER_MSIX_MMIO
> > +
> > +Capability: KVM_CAP_MSIX_MMIO
> > +Architectures: x86
> > +Type: vm ioctl
> > +Parameters: struct kvm_msix_mmio_user (in)
> > +Returns: 0 on success, -1 on error
> > +
> > +This API indicates an MSI-X MMIO address of a guest device. Then all
> > MMIO +operation would be handled by kernel. When necessary(e.g. MSI
> > data/address +changed), KVM would exit to userspace using
> > KVM_EXIT_MSIX_ROUTING_UPDATE to +indicate the MMIO modification and
> > require userspace to update IRQ routing +table.
> > +
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;		/* Device type and MMIO address type */
> > +	__u16 max_entries_nr;	/* Maximum entries supported */
> > +	__u64 base_addr;	/* Guest physical address of MMIO */
> > +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> > +	__u64 flags;		/* Reserved for now */
> > +	__u64 reserved[4];
> > +};
> > +
> > +Current device type can be:
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > +
> > +Current MMIO type can be:
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > +
> 
> How does userspace know which entry of which table changed?  Need a
> field in struct kvm_run for that.

We already get a guest MMIO address for that in the exit information. I've
created a chain of handlers in qemu to handle it.

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:18     ` Sheng Yang
  2011-01-17 12:18       ` Avi Kivity
@ 2011-01-17 12:39       ` Marcelo Tosatti
  2011-01-19  8:37         ` Sheng Yang
  1 sibling, 1 reply; 29+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 12:39 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote:
> > > +		goto out;
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > +
> > > +	if (get_user(old_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	/* No allow writing to other fields when entry is unmasked */
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > +		goto out;
> > 
> > Instead of copying to/from userspace (which is subject to swapin,
> > unexpected values), you could include the guest written value in a
> > kvm_run structure, along with address. Qemu-kvm would use that to
> > synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> 
> We want to accelerate MSI-X mask bit accesses, which won't exit to userspace in 
> most conditions. That's the cost we want to optimize. It is also possible for 
> userspace to read the correct value of the MMIO (but mostly userspace can't write to it, 
> in order to prevent synchronization issues).

Yes, i meant exit to userspace only when necessary, but instead of
copying directly everytime record the value the guest has written in
kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE.

> > > +	if (get_user(new_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > +		ret = -ENOTSYNC;
> > > +	if (old_ctrl == new_ctrl)
> > > +		goto out;
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > 
> > Then you rely on the kernel copy of the values to enable/disable irq.
> 
> Yes, they are guest owned assigned IRQs. Any potential issue?

Nothing prevents the kernel from enabling or disabling irq multiple
times with this code (what prevents it is a qemu-kvm that behaves as
expected). This is not very good.

Perhaps the guest can only harm itself with that, but i'm not sure.

And also if an msix table page is swapped out guest will hang.

> > > +	return r;
> > > +}
> > 
> > This is not consistent with registration, where there are no checks
> > regarding validity of assigned device id. So why is it necessary?
> 
> I don't quite understand. We need to free the mmio anyway; otherwise it would result 
> in wrong mmio interception...

If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the
read/write paths, there is no need to free anything.

> 
> > 
> > There is a lock ordering problem BTW:
> > 
> > - read/write handlers: dev->lock -> kvm->lock
> > - vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock
> 
> Good catch! Would fix it(and other comments of course).
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > +
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = mmio_user->dev_id;
> > > +	mmio.type = mmio_user->type;
> > > +
> > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > +}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-17 12:35     ` Sheng Yang
@ 2011-01-17 12:45       ` Avi Kivity
  2011-01-19  8:21         ` Sheng Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-17 12:45 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/17/2011 02:35 PM, Sheng Yang wrote:
> On Monday 17 January 2011 20:21:45 Avi Kivity wrote:
> >  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >  >  Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> >  >  ---
> >  >
> >  >    Documentation/kvm/api.txt |   41
> >  >    +++++++++++++++++++++++++++++++++++++++++ 1 files changed, 41
> >  >    insertions(+), 0 deletions(-)
> >  >
> >  >  diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> >  >  index e1a9297..4978b94 100644
> >  >  --- a/Documentation/kvm/api.txt
> >  >  +++ b/Documentation/kvm/api.txt
> >  >  @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
> >  >
> >  >    	__u16 padding[3];
> >  >
> >  >    };
> >  >
> >  >  +4.54 KVM_REGISTER_MSIX_MMIO
> >  >  +
> >  >  +Capability: KVM_CAP_MSIX_MMIO
> >  >  +Architectures: x86
> >  >  +Type: vm ioctl
> >  >  +Parameters: struct kvm_msix_mmio_user (in)
> >  >  +Returns: 0 on success, -1 on error
> >  >  +
> >  >  +This API indicates an MSI-X MMIO address of a guest device. Then all
> >  >  MMIO +operation would be handled by kernel. When necessary(e.g. MSI
> >  >  data/address +changed), KVM would exit to userspace using
> >  >  KVM_EXIT_MSIX_ROUTING_UPDATE to +indicate the MMIO modification and
> >  >  require userspace to update IRQ routing +table.
> >  >  +
> >  >  +struct kvm_msix_mmio_user {
> >  >  +	__u32 dev_id;
> >  >  +	__u16 type;		/* Device type and MMIO address type */
> >  >  +	__u16 max_entries_nr;	/* Maximum entries supported */
> >  >  +	__u64 base_addr;	/* Guest physical address of MMIO */
> >  >  +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> >  >  +	__u64 flags;		/* Reserved for now */
> >  >  +	__u64 reserved[4];
> >  >  +};
> >  >  +
> >  >  +Current device type can be:
> >  >  +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<   0)
> >  >  +
> >  >  +Current MMIO type can be:
> >  >  +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<   8)
> >  >  +
> >
> >  How does userspace know which entry of which table changed?  Need a
> >  field in struct kvm_run for that.
>
> We already got an guest MMIO address for that in the exit information. I've
> created a chain of handler in qemu to handle it.
>

But we already decoded the table and entry...

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:18       ` Avi Kivity
@ 2011-01-17 12:48         ` Marcelo Tosatti
  2011-01-17 12:51           ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 12:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
> On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >>  >  +
> >>  >  +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >>  >  +		goto out;
> >>
> >>  Instead of copying to/from userspace (which is subject to swapin,
> >>  unexpected values), you could include the guest written value in a
> >>  kvm_run structure, along with address. Qemu-kvm would use that to
> >>  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> >
> >We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in
> >the most condition. That's the cost we want to optimize. Also it's possible to
> >userspace to read the correct value of MMIO(but mostly userspace can't write to it
> >in order to prevent synchronize issue).
> 
> It's also good to have the values in just one place; using userspace
> makes it easy for both the kernel and userspace to see the values
> (and set them after migration, if/when we extend this to virtio).

Right, thats an advantage, but:

- How can userspace ever synchronize with updates by the kernel 
  to the MSI-X entry?
- Reading/writing to the userspace area must be done carefully,
  values must be validated before used.
- Swapping issue (minor?).



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:48         ` Marcelo Tosatti
@ 2011-01-17 12:51           ` Avi Kivity
  2011-01-17 15:52             ` Marcelo Tosatti
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-17 12:51 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On 01/17/2011 02:48 PM, Marcelo Tosatti wrote:
> On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
> >  On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >  >>   >   +
> >  >>   >   +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >  >>   >   +		goto out;
> >  >>
> >  >>   Instead of copying to/from userspace (which is subject to swapin,
> >  >>   unexpected values), you could include the guest written value in a
> >  >>   kvm_run structure, along with address. Qemu-kvm would use that to
> >  >>   synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> >  >
> >  >We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in
> >  >the most condition. That's the cost we want to optimize. Also it's possible to
> >  >userspace to read the correct value of MMIO(but mostly userspace can't write to it
> >  >in order to prevent synchronize issue).
> >
> >  It's also good to have the values in just one place; using userspace
> >  makes it easy for both the kernel and userspace to see the values
> >  (and set them after migration, if/when we extend this to virtio).
>
> Right, thats an advantage, but:
>
> - How can userspace ever synchronize with updates by the kernel
>    to the MSI-X entry?

When a value is written by the guest which kvm cannot handle itself 
(i.e. a change to anything other than the mask bit), we exit with the 
table and entry ids, so userspace can reread them.

> - Reading/writing to the userspace area must be done carefully,
>    values must be validated before used.

True every time...

> - Swapping issue (minor?).

I don't see the issue... just like any part of qemu that may be swapped 
out, blocking the vcpu thread.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:29   ` Avi Kivity
@ 2011-01-17 13:31     ` Michael S. Tsirkin
  2011-01-17 13:47     ` Jan Kiszka
  2011-01-30  4:38     ` Sheng Yang
  2 siblings, 0 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2011-01-17 13:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm

On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >Then we can support mask bit operation of assigned devices now.
> >
> >
> >
> >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >+				int assigned_dev_id, int entry, bool mask)
> >+{
> >+	int r = -EFAULT;
> >+	struct kvm_assigned_dev_kernel *adev;
> >+	int i;
> >+
> >+	if (!irqchip_in_kernel(kvm))
> >+		return r;
> >+
> >+	mutex_lock(&kvm->lock);
> >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >+				      assigned_dev_id);
> >+	if (!adev)
> >+		goto out;
> >+
> >+	for (i = 0; i<  adev->entries_nr; i++)
> >+		if (adev->host_msix_entries[i].entry == entry) {
> >+			if (mask)
> >+				disable_irq_nosync(
> >+					adev->host_msix_entries[i].vector);
> 
> Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> the mask bit on irq delivery, so we may forward an interrupt to the
> guest after the mask bit was set.
> 
> What does pci say about the mask bit?  when does it take effect?

It is only guaranteed to take effect on the next read.
In practice, it takes effect earlier and guests
rely on this as an optimization.

> Another question is whether disable_irq_nosync() actually programs
> the device mask bit, or not.  If it does, then it's slow, and it may
> be better to leave interrupts enabled but have an internal pending
> bit.  If it doesn't program the mask bit, it's fine.
> 
> >+			else
> >+				enable_irq(adev->host_msix_entries[i].vector);
> >+			r = 0;
> >+			break;
> >+		}
> >+out:
> >+	mutex_unlock(&kvm->lock);
> >+	return r;
> >+}
> >
> >+
> >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >+				void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, ret = 0, entry, offset, r;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx<  0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr&  0x3) || (len != 4&&  len != 8))
> >+		goto out;
> >+
> >+	offset = addr&  0xf;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> >+		goto out;
> >+
> >+	mmio =&mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >+	if (r)
> >+		goto out;
> 
> and return ret == 0?
> 
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> >+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> >+				int len, const void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, entry, offset, ret = 0, r = 0;
> >+	gpa_t entry_base;
> >+	u32 old_ctrl, new_ctrl;
> >+	u32 *ctrl_pos;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx<  0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr&  0x3) || (len != 4&&  len != 8))
> >+		goto out;
> >+
> >+	offset = addr&  0xF;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> >+		goto out;
> >+
> >+	mmio =&mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> >+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> >+
> >+	if (get_user(old_ctrl, ctrl_pos))
> >+		goto out;
> >+
> >+	/* No allow writing to other fields when entry is unmasked */
> >+	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> >+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> >+		goto out;
> >+
> >+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >+		goto out;
> >+
> >+	if (get_user(new_ctrl, ctrl_pos))
> >+		goto out;
> 
> here, too.
> 
> >+
> >+	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> >+	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> >+		ret = -ENOTSYNC;
> 
> goto out?
> 
> >+	if (old_ctrl == new_ctrl)
> >+		goto out;
> >+	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> >+			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> >+	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> >+			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> >+	if (r || ret)
> >+		ret = -ENOTSYNC;
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> 
> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:29   ` Avi Kivity
  2011-01-17 13:31     ` Michael S. Tsirkin
@ 2011-01-17 13:47     ` Jan Kiszka
  2011-01-30  4:38     ` Sheng Yang
  2 siblings, 0 replies; 29+ messages in thread
From: Jan Kiszka @ 2011-01-17 13:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, Michael S. Tsirkin, kvm

On 2011-01-17 13:29, Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
>> Then we can support mask bit operation of assigned devices now.
>>
>>
>>
>> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
>> +                int assigned_dev_id, int entry, bool mask)
>> +{
>> +    int r = -EFAULT;
>> +    struct kvm_assigned_dev_kernel *adev;
>> +    int i;
>> +
>> +    if (!irqchip_in_kernel(kvm))
>> +        return r;
>> +
>> +    mutex_lock(&kvm->lock);
>> +    adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
>> +                      assigned_dev_id);
>> +    if (!adev)
>> +        goto out;
>> +
>> +    for (i = 0; i<  adev->entries_nr; i++)
>> +        if (adev->host_msix_entries[i].entry == entry) {
>> +            if (mask)
>> +                disable_irq_nosync(
>> +                    adev->host_msix_entries[i].vector);
> 
> Is it okay to call disable_irq_nosync() here?  IIRC we don't check the
> mask bit on irq delivery, so we may forward an interrupt to the guest
> after the mask bit was set.
> 
> What does pci say about the mask bit?  when does it take effect?
> 
> Another question is whether disable_irq_nosync() actually programs the
> device mask bit, or not.  If it does, then it's slow, and it may be
> better to leave interrupts enabled but have an internal pending bit.  If
> it doesn't program the mask bit, it's fine.

disable_irq* works lazily, only disabling the line in software. If an
IRQ actually occurs, it is marked pending and the line is left masked on
handler return (that's one reason why masking via PCI config space is
slower at macro level).

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:51           ` Avi Kivity
@ 2011-01-17 15:52             ` Marcelo Tosatti
  2011-01-17 16:01               ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Marcelo Tosatti @ 2011-01-17 15:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On Mon, Jan 17, 2011 at 02:51:37PM +0200, Avi Kivity wrote:
> On 01/17/2011 02:48 PM, Marcelo Tosatti wrote:
> >On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
> >>  On 01/17/2011 02:18 PM, Sheng Yang wrote:
> >>  >>   >   +
> >>  >>   >   +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >>  >>   >   +		goto out;
> >>  >>
> >>  >>   Instead of copying to/from userspace (which is subject to swapin,
> >>  >>   unexpected values), you could include the guest written value in a
> >>  >>   kvm_run structure, along with address. Qemu-kvm would use that to
> >>  >>   synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
> >>  >
> >>  >We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in
> >>  >the most condition. That's the cost we want to optimize. Also it's possible to
> >>  >userspace to read the correct value of MMIO(but mostly userspace can't write to it
> >>  >in order to prevent synchronize issue).
> >>
> >>  It's also good to have the values in just one place; using userspace
> >>  makes it easy for both the kernel and userspace to see the values
> >>  (and set them after migration, if/when we extend this to virtio).
> >
> >Right, thats an advantage, but:
> >
> >- How can userspace ever synchronize with updates by the kernel
> >   to the MSI-X entry?
> 
> What a value is written by the guest, which kvm cannot handle itself
> (i.e. a change to anything other than the mask bit), we exit with
> the table and entry ids, so userspace can reread them.

OK. But regarding access to the MSI-X entry in userspace, it can 
only be accessed safely wrt parallel updates by the kernel in the
exit handler.

Is the exit handler the only location where the MSI-X entry will be
read or written to, in userspace?

> >- Reading/writing to the userspace area must be done carefully,
> >   values must be validated before used.
> 
> True every time...
> 
> >- Swapping issue (minor?).
> 
> I don't see the issue... just like any part of qemu that may be
> swapped out, blocking the vcpu thread.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 15:52             ` Marcelo Tosatti
@ 2011-01-17 16:01               ` Avi Kivity
  0 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2011-01-17 16:01 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Sheng Yang, Michael S. Tsirkin, kvm

On 01/17/2011 05:52 PM, Marcelo Tosatti wrote:
> >
> >  What a value is written by the guest, which kvm cannot handle itself
> >  (i.e. a change to anything other than the mask bit), we exit with
> >  the table and entry ids, so userspace can reread them.
>
> OK. But regarding access to the MSI-X entry in userspace, it can
> only be accessed safely wrt parallel updates by the kernel in the
> exit handler.
>
> Is the exit handler the only location where the MSI-X entry will be
> read or written to, in userspace?
>

Yes.  This should be documented, that the kernel owns the msix area as 
far as writes are concerned.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-17 12:45       ` Avi Kivity
@ 2011-01-19  8:21         ` Sheng Yang
  2011-01-25 12:47           ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Sheng Yang @ 2011-01-19  8:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Monday 17 January 2011 20:45:55 Avi Kivity wrote:
> On 01/17/2011 02:35 PM, Sheng Yang wrote:
> > On Monday 17 January 2011 20:21:45 Avi Kivity wrote:
> > >  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> > >  >  Signed-off-by: Sheng Yang<sheng@linux.intel.com>
> > >  >  ---
> > >  >  
> > >  >    Documentation/kvm/api.txt |   41
> > >  >    +++++++++++++++++++++++++++++++++++++++++ 1 files changed, 41
> > >  >    insertions(+), 0 deletions(-)
> > >  >  
> > >  >  diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> > >  >  index e1a9297..4978b94 100644
> > >  >  --- a/Documentation/kvm/api.txt
> > >  >  +++ b/Documentation/kvm/api.txt
> > >  >  @@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
> > >  >  
> > >  >    	__u16 padding[3];
> > >  >    
> > >  >    };
> > >  >  
> > >  >  +4.54 KVM_REGISTER_MSIX_MMIO
> > >  >  +
> > >  >  +Capability: KVM_CAP_MSIX_MMIO
> > >  >  +Architectures: x86
> > >  >  +Type: vm ioctl
> > >  >  +Parameters: struct kvm_msix_mmio_user (in)
> > >  >  +Returns: 0 on success, -1 on error
> > >  >  +
> > >  >  +This API indicates an MSI-X MMIO address of a guest device. Then
> > >  >  all MMIO +operation would be handled by kernel. When
> > >  >  necessary(e.g. MSI data/address +changed), KVM would exit to
> > >  >  userspace using
> > >  >  KVM_EXIT_MSIX_ROUTING_UPDATE to +indicate the MMIO modification and
> > >  >  require userspace to update IRQ routing +table.
> > >  >  +
> > >  >  +struct kvm_msix_mmio_user {
> > >  >  +	__u32 dev_id;
> > >  >  +	__u16 type;		/* Device type and MMIO address type */
> > >  >  +	__u16 max_entries_nr;	/* Maximum entries supported */
> > >  >  +	__u64 base_addr;	/* Guest physical address of MMIO */
> > >  >  +	__u64 base_va;		/* Host virtual address of MMIO mapping */
> > >  >  +	__u64 flags;		/* Reserved for now */
> > >  >  +	__u64 reserved[4];
> > >  >  +};
> > >  >  +
> > >  >  +Current device type can be:
> > >  >  +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<   0)
> > >  >  +
> > >  >  +Current MMIO type can be:
> > >  >  +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<   8)
> > >  >  +
> > >  
> > >  How does userspace know which entry of which table changed?  Need a
> > >  field in struct kvm_run for that.
> > 
> > We already got a guest MMIO address for that in the exit information.
> > I've created a chain of handlers in qemu to handle it.
> 
> But we already decoded the table and entry...

But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO. So it's 
not quite handy to get the table and entry out. Also the updater in the userspace 
can share the most logic with ordinary userspace MMIO handler, which take address 
as parameter. So I think we don't need to pass the decoded table_id and entry to 
userspace.

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:39       ` Marcelo Tosatti
@ 2011-01-19  8:37         ` Sheng Yang
  0 siblings, 0 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-19  8:37 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm

On Monday 17 January 2011 20:39:30 Marcelo Tosatti wrote:
> On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote:
> > > > +		goto out;
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > > +
> > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > > +		goto out;
> > > 
> > > Instead of copying to/from userspace (which is subject to swapin,
> > > unexpected values), you could include the guest written value in a
> > > kvm_run structure, along with address. Qemu-kvm would use that to
> > > synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE
> > > exit.
> > 
> > We want to accelerate MSI-X mask bit accessing, which won't exit to
> > userspace in the most common case. That's the cost we want to optimize.
> > Also it's possible for userspace to read the correct value of MMIO (but
> > mostly userspace can't write to it, in order to prevent synchronization
> > issues).
> 
> Yes, i meant exit to userspace only when necessary, but instead of
> copying directly everytime record the value the guest has written in
> kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE.
> 
> > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> > > > +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> > > > +		ret = -ENOTSYNC;
> > > > +	if (old_ctrl == new_ctrl)
> > > > +		goto out;
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > 
> > > Then you rely on the kernel copy of the values to enable/disable irq.
> > 
> > Yes, they are guest owned assigned IRQs. Any potential issue?
> 
> Nothing prevents the kernel from enabling or disabling irq multiple
> times with this code (what prevents it is a qemu-kvm that behaves as
> expected). This is not very good.
> 
> Perhaps the guest can only harm itself with that, but i'm not sure.

MSI-X interrupts are not shared, so I think guest can only harm itself if it was 
doing it wrong.
> 
> And also if an msix table page is swapped out guest will hang.
> 
> > > > +	return r;
> > > > +}
> > > 
> > > This is not consistent with registration, where there are no checks
> > > regarding validity of assigned device id. So why is it necessary?
> > 
> > I am not quite understand. We need to free mmio anyway, otherwise it
> > would result in wrong mmio interception...
> 
> If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the
> read/write paths, there is no need to free anything.

It may work with assigned devices, but the potential users of this include vfio 
drivers and emulated devices. And I don't think it's a good idea to have a 
registration process but no free process...

--
regards
Yang, Sheng

> 
> > > There is a lock ordering problem BTW:
> > > 
> > > - read/write handlers: dev->lock -> kvm->lock
> > > - vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock ->
> > > dev->lock
> > 
> > Good catch! Would fix it(and other comments of course).
> > 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > +
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > +	mmio.type = mmio_user->type;
> > > > +
> > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > +}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-19  8:21         ` Sheng Yang
@ 2011-01-25 12:47           ` Avi Kivity
  2011-01-26  9:05             ` Sheng Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-25 12:47 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/19/2011 10:21 AM, Sheng Yang wrote:
> >  >
> >  >  We already got an guest MMIO address for that in the exit information.
> >  >  I've created a chain of handler in qemu to handle it.
> >
> >  But we already decoded the table and entry...
>
> But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO. So it's
> not quite handy to get the table and entry out.

The kernel handler can create a new kvm_run exit description.

>   Also the updater in the userspace
> can share the most logic with ordinary userspace MMIO handler, which take address
> as parameter. So I think we don't need to pass the decoded table_id and entry to
> userspace.

It's mixing layers, which always leads to trouble.  For one, the user 
handler shouldn't do anything with the write since the kernel already 
wrote it into the table.  For another, if two vcpus write to the same 
entry simultaneously, you could see different ordering in the kernel and 
userspace, and get inconsistent results.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-25 12:47           ` Avi Kivity
@ 2011-01-26  9:05             ` Sheng Yang
  2011-01-31 13:24               ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Sheng Yang @ 2011-01-26  9:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Tuesday 25 January 2011 20:47:38 Avi Kivity wrote:
> On 01/19/2011 10:21 AM, Sheng Yang wrote:
> > >  >  We already got an guest MMIO address for that in the exit
> > >  >  information. I've created a chain of handler in qemu to handle it.
> > >  
> > >  But we already decoded the table and entry...
> > 
> > But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO.
> > So it's not quite handy to get the table and entry out.
> 
> The kernel handler can create a new kvm_run exit description.
> 
> >   Also the updater in the userspace
> > 
> > can share the most logic with ordinary userspace MMIO handler, which take
> > address as parameter. So I think we don't need to pass the decoded
> > table_id and entry to userspace.
> 
> It's mixing layers, which always leads to trouble.  For one, the user
> handler shouldn't do anything with the write since the kernel already
> wrote it into the table.  For another, if two vcpus write to the same
> entry simultaneously, you could see different ordering in the kernel and
> userspace, and get inconsistent results.

The shared logic is not about writing, but about interpreting what's written. The old 
MMIO handler would write the data, then interpret it; and our new MMIO would only 
share the logic of interpretation. I think that's fair enough?

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-17 12:29   ` Avi Kivity
  2011-01-17 13:31     ` Michael S. Tsirkin
  2011-01-17 13:47     ` Jan Kiszka
@ 2011-01-30  4:38     ` Sheng Yang
  2011-01-31 13:09       ` Avi Kivity
  2 siblings, 1 reply; 29+ messages in thread
From: Sheng Yang @ 2011-01-30  4:38 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

(Sorry, missed this mail...)

On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >Then we can support mask bit operation of assigned devices now.
> >
> >
> >
> >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >+				int assigned_dev_id, int entry, bool mask)
> >+{
> >+	int r = -EFAULT;
> >+	struct kvm_assigned_dev_kernel *adev;
> >+	int i;
> >+
> >+	if (!irqchip_in_kernel(kvm))
> >+		return r;
> >+
> >+	mutex_lock(&kvm->lock);
> >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >+				      assigned_dev_id);
> >+	if (!adev)
> >+		goto out;
> >+
> >+	for (i = 0; i<  adev->entries_nr; i++)
> >+		if (adev->host_msix_entries[i].entry == entry) {
> >+			if (mask)
> >+				disable_irq_nosync(
> >+					adev->host_msix_entries[i].vector);
> 
> Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> the mask bit on irq delivery, so we may forward an interrupt to the
> guest after the mask bit was set.
> 
> What does pci say about the mask bit?  when does it take effect?
> 
> Another question is whether disable_irq_nosync() actually programs
> the device mask bit, or not.  If it does, then it's slow, and it may
> be better to leave interrupts enabled but have an internal pending
> bit.  If it doesn't program the mask bit, it's fine.

I think Michael and Jan had explained this.
> 
> >+			else
> >+				enable_irq(adev->host_msix_entries[i].vector);
> >+			r = 0;
> >+			break;
> >+		}
> >+out:
> >+	mutex_unlock(&kvm->lock);
> >+	return r;
> >+}
> >
> >+
> >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >+				void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, ret = 0, entry, offset, r;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx<  0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr&  0x3) || (len != 4&&  len != 8))
> >+		goto out;
> >+
> >+	offset = addr&  0xf;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> >+		goto out;
> >+
> >+	mmio =&mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >+	if (r)
> >+		goto out;
> 
> and return ret == 0?

Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0
in order to omit this action. We can add warning to it later.
> 
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> >+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> >+				int len, const void *val)
> >+{
> >+	struct kvm_msix_mmio_dev *mmio_dev =
> >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >+	struct kvm_msix_mmio *mmio;
> >+	int idx, entry, offset, ret = 0, r = 0;
> >+	gpa_t entry_base;
> >+	u32 old_ctrl, new_ctrl;
> >+	u32 *ctrl_pos;
> >+
> >+	mutex_lock(&mmio_dev->lock);
> >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >+	if (idx<  0) {
> >+		ret = -EOPNOTSUPP;
> >+		goto out;
> >+	}
> >+	if ((addr&  0x3) || (len != 4&&  len != 8))
> >+		goto out;
> >+
> >+	offset = addr&  0xF;
> >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 8)
> >+		goto out;
> >+
> >+	mmio =&mmio_dev->mmio[idx];
> >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> >+	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> >+
> >+	if (get_user(old_ctrl, ctrl_pos))
> >+		goto out;
> >+
> >+	/* No allow writing to other fields when entry is unmasked */
> >+	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> >+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> >+		goto out;
> >+
> >+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> >+		goto out;
> >+
> >+	if (get_user(new_ctrl, ctrl_pos))
> >+		goto out;
> 
> here, too.

The same as above.
> 
> >+
> >+	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> >+	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> >+		ret = -ENOTSYNC;
> 
> goto out?

No. This judgement only checks whether MSI data/address was touched. And the line
below would check if we need to operate on the mask bit. Because in theory the guest
can use len=8 to modify MSI-X data and ctrl at the same time.

-- 
regards
Yang, Sheng
> 
> >+	if (old_ctrl == new_ctrl)
> >+		goto out;
> >+	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> >+			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> >+	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> >+			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> >+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> >+	if (r || ret)
> >+		ret = -ENOTSYNC;
> >+out:
> >+	mutex_unlock(&mmio_dev->lock);
> >+	return ret;
> >+}
> >+
> 
> -- 
> error compiling committee.c: too many arguments to function
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-30  4:38     ` Sheng Yang
@ 2011-01-31 13:09       ` Avi Kivity
  2011-02-01  4:21         ` Sheng Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-31 13:09 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/30/2011 06:38 AM, Sheng Yang wrote:
> (Sorry, missed this mail...)
>
> On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> >  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >  >Then we can support mask bit operation of assigned devices now.
> >  >
> >  >
> >  >
> >  >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >  >+				int assigned_dev_id, int entry, bool mask)
> >  >+{
> >  >+	int r = -EFAULT;
> >  >+	struct kvm_assigned_dev_kernel *adev;
> >  >+	int i;
> >  >+
> >  >+	if (!irqchip_in_kernel(kvm))
> >  >+		return r;
> >  >+
> >  >+	mutex_lock(&kvm->lock);
> >  >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >  >+				      assigned_dev_id);
> >  >+	if (!adev)
> >  >+		goto out;
> >  >+
> >  >+	for (i = 0; i<   adev->entries_nr; i++)
> >  >+		if (adev->host_msix_entries[i].entry == entry) {
> >  >+			if (mask)
> >  >+				disable_irq_nosync(
> >  >+					adev->host_msix_entries[i].vector);
> >
> >  Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> >  the mask bit on irq delivery, so we may forward an interrupt to the
> >  guest after the mask bit was set.
> >
> >  What does pci say about the mask bit?  when does it take effect?
> >
> >  Another question is whether disable_irq_nosync() actually programs
> >  the device mask bit, or not.  If it does, then it's slow, and it may
> >  be better to leave interrupts enabled but have an internal pending
> >  bit.  If it doesn't program the mask bit, it's fine.
>
> I think Michael and Jan had explained this.
> >
> >  >+			else
> >  >+				enable_irq(adev->host_msix_entries[i].vector);
> >  >+			r = 0;
> >  >+			break;
> >  >+		}
> >  >+out:
> >  >+	mutex_unlock(&kvm->lock);
> >  >+	return r;
> >  >+}
> >  >
> >  >+
> >  >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  >+				void *val)
> >  >+{
> >  >+	struct kvm_msix_mmio_dev *mmio_dev =
> >  >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >  >+	struct kvm_msix_mmio *mmio;
> >  >+	int idx, ret = 0, entry, offset, r;
> >  >+
> >  >+	mutex_lock(&mmio_dev->lock);
> >  >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >  >+	if (idx<   0) {
> >  >+		ret = -EOPNOTSUPP;
> >  >+		goto out;
> >  >+	}
> >  >+	if ((addr&   0x3) || (len != 4&&   len != 8))
> >  >+		goto out;
> >  >+
> >  >+	offset = addr&   0xf;
> >  >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&   len == 8)
> >  >+		goto out;
> >  >+
> >  >+	mmio =&mmio_dev->mmio[idx];
> >  >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >  >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >  >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >  >+	if (r)
> >  >+		goto out;
> >
> >  and return ret == 0?
>
> Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0
> in order to omit this action. We can add warning to it later.

But it failed.  We need to return -EFAULT.

> The same as above.
> >
> >  >+
> >  >+	if ((offset<   PCI_MSIX_ENTRY_VECTOR_CTRL&&   len == 4) ||
> >  >+	    (offset<   PCI_MSIX_ENTRY_DATA&&   len == 8))
> >  >+		ret = -ENOTSYNC;
> >
> >  goto out?
>
> No. This judgement only check if MSI data/address was touched. And the line
> below would check if we need to operate mask bit. Because in theory guest can
> use len=8 to modify MSI-X data and ctrl at the same time.
>

Ok, makes sense.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-26  9:05             ` Sheng Yang
@ 2011-01-31 13:24               ` Avi Kivity
  2011-02-01  4:26                 ` Sheng Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2011-01-31 13:24 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On 01/26/2011 11:05 AM, Sheng Yang wrote:
> On Tuesday 25 January 2011 20:47:38 Avi Kivity wrote:
> >  On 01/19/2011 10:21 AM, Sheng Yang wrote:
> >  >  >   >   We already got an guest MMIO address for that in the exit
> >  >  >   >   information. I've created a chain of handler in qemu to handle it.
> >  >  >
> >  >  >   But we already decoded the table and entry...
> >  >
> >  >  But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO.
> >  >  So it's not quite handy to get the table and entry out.
> >
> >  The kernel handler can create a new kvm_run exit description.
> >
> >  >    Also the updater in the userspace
> >  >
> >  >  can share the most logic with ordinary userspace MMIO handler, which take
> >  >  address as parameter. So I think we don't need to pass the decoded
> >  >  table_id and entry to userspace.
> >
> >  It's mixing layers, which always leads to trouble.  For one, the user
> >  handler shouldn't do anything with the write since the kernel already
> >  wrote it into the table.  For another, if two vcpus write to the same
> >  entry simultaneously, you could see different ordering in the kernel and
> >  userspace, and get inconsistent results.
>
> The shared logic is not about writing, but about interpret what's written. Old
> MMIO handler would write the data, then interpret it; and our new MMIO would only
> share the logic of interpretation. I think that's fair enough?

It doesn't make sense from an API point of view.  You registered a table 
of entries, so you expect an exit on that table to point to the table and 
entry that got changed.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
  2011-01-31 13:09       ` Avi Kivity
@ 2011-02-01  4:21         ` Sheng Yang
  0 siblings, 0 replies; 29+ messages in thread
From: Sheng Yang @ 2011-02-01  4:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Mon, Jan 31, 2011 at 03:09:09PM +0200, Avi Kivity wrote:
> On 01/30/2011 06:38 AM, Sheng Yang wrote:
> >(Sorry, missed this mail...)
> >
> >On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
> >>  On 01/06/2011 12:19 PM, Sheng Yang wrote:
> >>  >Then we can support mask bit operation of assigned devices now.
> >>  >
> >>  >
> >>  >
> >>  >+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> >>  >+				int assigned_dev_id, int entry, bool mask)
> >>  >+{
> >>  >+	int r = -EFAULT;
> >>  >+	struct kvm_assigned_dev_kernel *adev;
> >>  >+	int i;
> >>  >+
> >>  >+	if (!irqchip_in_kernel(kvm))
> >>  >+		return r;
> >>  >+
> >>  >+	mutex_lock(&kvm->lock);
> >>  >+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >>  >+				      assigned_dev_id);
> >>  >+	if (!adev)
> >>  >+		goto out;
> >>  >+
> >>  >+	for (i = 0; i<   adev->entries_nr; i++)
> >>  >+		if (adev->host_msix_entries[i].entry == entry) {
> >>  >+			if (mask)
> >>  >+				disable_irq_nosync(
> >>  >+					adev->host_msix_entries[i].vector);
> >>
> >>  Is it okay to call disable_irq_nosync() here?  IIRC we don't check
> >>  the mask bit on irq delivery, so we may forward an interrupt to the
> >>  guest after the mask bit was set.
> >>
> >>  What does pci say about the mask bit?  when does it take effect?
> >>
> >>  Another question is whether disable_irq_nosync() actually programs
> >>  the device mask bit, or not.  If it does, then it's slow, and it may
> >>  be better to leave interrupts enabled but have an internal pending
> >>  bit.  If it doesn't program the mask bit, it's fine.
> >
> >I think Michael and Jan had explained this.
> >>
> >>  >+			else
> >>  >+				enable_irq(adev->host_msix_entries[i].vector);
> >>  >+			r = 0;
> >>  >+			break;
> >>  >+		}
> >>  >+out:
> >>  >+	mutex_unlock(&kvm->lock);
> >>  >+	return r;
> >>  >+}
> >>  >
> >>  >+
> >>  >+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >>  >+				void *val)
> >>  >+{
> >>  >+	struct kvm_msix_mmio_dev *mmio_dev =
> >>  >+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> >>  >+	struct kvm_msix_mmio *mmio;
> >>  >+	int idx, ret = 0, entry, offset, r;
> >>  >+
> >>  >+	mutex_lock(&mmio_dev->lock);
> >>  >+	idx = get_mmio_table_index(mmio_dev, addr, len);
> >>  >+	if (idx<   0) {
> >>  >+		ret = -EOPNOTSUPP;
> >>  >+		goto out;
> >>  >+	}
> >>  >+	if ((addr&   0x3) || (len != 4&&   len != 8))
> >>  >+		goto out;
> >>  >+
> >>  >+	offset = addr&   0xf;
> >>  >+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL&&   len == 8)
> >>  >+		goto out;
> >>  >+
> >>  >+	mmio =&mmio_dev->mmio[idx];
> >>  >+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> >>  >+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> >>  >+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> >>  >+	if (r)
> >>  >+		goto out;
> >>
> >>  and return ret == 0?
> >
> >Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0
> >in order to omit this action. We can add warning to it later.
> 
> But it failed.  We need to return -EFAULT.

So it would return to QEmu. OK, let QEmu prints warning about it.

-- 
regards
Yang, Sheng
> 
> >The same as above.
> >>
> >>  >+
> >>  >+	if ((offset<   PCI_MSIX_ENTRY_VECTOR_CTRL&&   len == 4) ||
> >>  >+	    (offset<   PCI_MSIX_ENTRY_DATA&&   len == 8))
> >>  >+		ret = -ENOTSYNC;
> >>
> >>  goto out?
> >
> >No. This judgement only check if MSI data/address was touched. And the line
> >below would check if we need to operate mask bit. Because in theory guest can
> >use len=8 to modify MSI-X data and ctrl at the same time.
> >
> 
> Ok, makes sense.
> 
> -- 
> error compiling committee.c: too many arguments to function
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-31 13:24               ` Avi Kivity
@ 2011-02-01  4:26                 ` Sheng Yang
  0 siblings, 0 replies; 29+ messages in thread
From: Sheng Yang @ 2011-02-01  4:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Mon, Jan 31, 2011 at 03:24:27PM +0200, Avi Kivity wrote:
> On 01/26/2011 11:05 AM, Sheng Yang wrote:
> >On Tuesday 25 January 2011 20:47:38 Avi Kivity wrote:
> >>  On 01/19/2011 10:21 AM, Sheng Yang wrote:
> >>  >  >   >   We already got an guest MMIO address for that in the exit
> >>  >  >   >   information. I've created a chain of handler in qemu to handle it.
> >>  >  >
> >>  >  >   But we already decoded the table and entry...
> >>  >
> >>  >  But the handler is still wrapped by vcpu_mmio_write(), as a part of MMIO.
> >>  >  So it's not quite handy to get the table and entry out.
> >>
> >>  The kernel handler can create a new kvm_run exit description.
> >>
> >>  >    Also the updater in the userspace
> >>  >
> >>  >  can share the most logic with ordinary userspace MMIO handler, which take
> >>  >  address as parameter. So I think we don't need to pass the decoded
> >>  >  table_id and entry to userspace.
> >>
> >>  It's mixing layers, which always leads to trouble.  For one, the user
> >>  handler shouldn't do anything with the write since the kernel already
> >>  wrote it into the table.  For another, if two vcpus write to the same
> >>  entry simultaneously, you could see different ordering in the kernel and
> >>  userspace, and get inconsistent results.
> >
> >The shared logic is not about writing, but about interpret what's written. Old
> >MMIO handler would write the data, then interpret it; and our new MMIO would only
> >share the logic of interpretation. I think that's fair enough?
> 
> It dosn't make sense for an API point of view.  You registered a
> table of entries, you expect an exit on that table to point to the
> table and entry that got changed.

OK, I would update this when I come back to work(about two weeks later...).

-- 
regards
Yang, Sheng

> 
> -- 
> error compiling committee.c: too many arguments to function
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/3] KVM: Add documents for MSI-X MMIO API
  2011-01-30  5:11 [PATCH 0/3 v8] " Sheng Yang
@ 2011-01-30  5:11 ` Sheng Yang
  0 siblings, 0 replies; 29+ messages in thread
From: Sheng Yang @ 2011-01-30  5:11 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang


Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 Documentation/kvm/api.txt |   47 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..e6b7a1d 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,53 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API indicates an MSI-X MMIO address of a guest device. Then all MMIO
+operations would be handled by the kernel. When necessary (e.g. MSI data/address
+changed), KVM would exit to userspace using KVM_EXIT_MSIX_ROUTING_UPDATE to
+indicate the MMIO modification and require userspace to update the IRQ routing
+table.
+
+NOTICE: Writing the MSI-X MMIO page after it was registered with this API may
+be dangerous for the userspace program. Writing while the VM is running may
+result in synchronization issues, so the assigned device may not work properly.
+Writing is allowed when the VM is not running and can be used as a save/restore
+mechanism.
+
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;		/* Device type and MMIO address type */
+	__u16 max_entries_nr;	/* Maximum entries supported */
+	__u64 base_addr;	/* Guest physical address of MMIO */
+	__u64 base_va;		/* Host virtual address of MMIO mapping */
+	__u64 flags;		/* Reserved for now */
+	__u64 reserved[4];
+};
+
+Current device type can be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+Current MMIO type can be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API would unregister the specific MSI-X MMIO, indicated by the dev_id and
+type fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2011-02-01  4:25 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-06 10:19 [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
2011-01-06 10:19 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
2011-01-06 10:19 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang
2011-01-17 11:54   ` Marcelo Tosatti
2011-01-17 12:18     ` Sheng Yang
2011-01-17 12:18       ` Avi Kivity
2011-01-17 12:48         ` Marcelo Tosatti
2011-01-17 12:51           ` Avi Kivity
2011-01-17 15:52             ` Marcelo Tosatti
2011-01-17 16:01               ` Avi Kivity
2011-01-17 12:39       ` Marcelo Tosatti
2011-01-19  8:37         ` Sheng Yang
2011-01-17 12:29   ` Avi Kivity
2011-01-17 13:31     ` Michael S. Tsirkin
2011-01-17 13:47     ` Jan Kiszka
2011-01-30  4:38     ` Sheng Yang
2011-01-31 13:09       ` Avi Kivity
2011-02-01  4:21         ` Sheng Yang
2011-01-06 10:19 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang
2011-01-17 12:21   ` Avi Kivity
2011-01-17 12:35     ` Sheng Yang
2011-01-17 12:45       ` Avi Kivity
2011-01-19  8:21         ` Sheng Yang
2011-01-25 12:47           ` Avi Kivity
2011-01-26  9:05             ` Sheng Yang
2011-01-31 13:24               ` Avi Kivity
2011-02-01  4:26                 ` Sheng Yang
2011-01-12  2:23 ` [PATCH 0/3 v7] MSI-X MMIO support for KVM Sheng Yang
2011-01-30  5:11 [PATCH 0/3 v8] " Sheng Yang
2011-01-30  5:11 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.