kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
@ 2020-12-29 10:02 Elena Afanasova
  2020-12-29 10:02 ` [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Elena Afanasova @ 2020-12-29 10:02 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

This patchset introduces a KVM dispatch mechanism which can be used 
for handling MMIO/PIO accesses over file descriptors without returning 
from ioctl(KVM_RUN). This allows device emulation to run in another task 
separate from the vCPU task.

This is achieved through KVM vm ioctl for registering MMIO/PIO regions and 
a wire protocol that KVM uses to communicate with a task handling an 
MMIO/PIO access.

ioregionfd relies on kmemcg in order to limit the amount of kernel memory 
that userspace can consume. Can the hardcoded NR_IOBUS_DEVS limit be 
enforced only in the case where kmemcg is disabled?

Elena Afanasova (2):
  KVM: add initial support for KVM_SET_IOREGION
  KVM: add initial support for ioregionfd blocking read/write operations

 arch/x86/kvm/Kconfig     |   1 +
 arch/x86/kvm/Makefile    |   1 +
 arch/x86/kvm/x86.c       |   1 +
 include/linux/kvm_host.h |  17 ++
 include/uapi/linux/kvm.h |  23 +++
 virt/kvm/Kconfig         |   3 +
 virt/kvm/eventfd.c       |  25 +++
 virt/kvm/eventfd.h       |  14 ++
 virt/kvm/ioregion.c      | 390 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/ioregion.h      |  15 ++
 virt/kvm/kvm_main.c      |  20 +-
 11 files changed, 507 insertions(+), 3 deletions(-)
 create mode 100644 virt/kvm/eventfd.h
 create mode 100644 virt/kvm/ioregion.c
 create mode 100644 virt/kvm/ioregion.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2020-12-29 10:02 [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
@ 2020-12-29 10:02 ` Elena Afanasova
  2020-12-29 11:36   ` Stefan Hajnoczi
  2020-12-31  3:45   ` Jason Wang
  2020-12-29 10:02 ` [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations Elena Afanasova
  2020-12-29 12:06 ` [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
  2 siblings, 2 replies; 28+ messages in thread
From: Elena Afanasova @ 2020-12-29 10:02 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
read and write accesses are dispatched through the given ioregionfd
instead of returning from ioctl(KVM_RUN). Regions can be deleted by
setting fds to -1.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
 arch/x86/kvm/Kconfig     |   1 +
 arch/x86/kvm/Makefile    |   1 +
 arch/x86/kvm/x86.c       |   1 +
 include/linux/kvm_host.h |  17 +++
 include/uapi/linux/kvm.h |  23 ++++
 virt/kvm/Kconfig         |   3 +
 virt/kvm/eventfd.c       |  25 +++++
 virt/kvm/eventfd.h       |  14 +++
 virt/kvm/ioregion.c      | 233 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/ioregion.h      |  15 +++
 virt/kvm/kvm_main.c      |  20 +++-
 11 files changed, 350 insertions(+), 3 deletions(-)
 create mode 100644 virt/kvm/eventfd.h
 create mode 100644 virt/kvm/ioregion.c
 create mode 100644 virt/kvm/ioregion.h

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f92dfd8ef10d..b914ef375199 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -33,6 +33,7 @@ config KVM
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
+	select KVM_IOREGION
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..b3b17dc9f7d4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
 				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
+kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e545a8a613b1..ddb28f5ca252 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X86_USER_SPACE_MSR:
 	case KVM_CAP_X86_MSR_FILTER:
 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+	case KVM_CAP_IOREGIONFD:
 		r = 1;
 		break;
 	case KVM_CAP_SYNC_REGS:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7f2e2a09ebbd..7cd667dddba9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -470,6 +470,10 @@ struct kvm {
 		struct mutex      resampler_lock;
 	} irqfds;
 	struct list_head ioeventfds;
+#endif
+#ifdef CONFIG_KVM_IOREGION
+	struct list_head ioregions_mmio;
+	struct list_head ioregions_pio;
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
@@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
+#ifdef CONFIG_KVM_IOREGION
+void kvm_ioregionfd_init(struct kvm *kvm);
+int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
+
+#else
+
+static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
+static inline int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	return -ENOSYS;
+}
+#endif
+
 void kvm_arch_irq_routing_update(struct kvm *kvm);
 
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ca41220b40b8..81e775778c66 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -732,6 +732,27 @@ struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+enum {
+	kvm_ioregion_flag_nr_pio,
+	kvm_ioregion_flag_nr_posted_writes,
+	kvm_ioregion_flag_nr_max,
+};
+
+#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
+#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
+
+#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
+
+struct kvm_ioregion {
+	__u64 guest_paddr; /* guest physical address */
+	__u64 memory_size; /* bytes */
+	__u64 user_data;
+	__s32 rfd;
+	__s32 wfd;
+	__u32 flags;
+	__u8  pad[28];
+};
+
 #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
@@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_USER_SPACE_MSR 188
 #define KVM_CAP_X86_MSR_FILTER 189
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
+#define KVM_CAP_IOREGIONFD 191
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct kvm_ioregion)
 
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1c37ccd5d402..5e6620bbf000 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
 
+config KVM_IOREGION
+       bool
+
 config KVM_MMIO
        bool
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c2323c27a28b..aadb73903f8b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -27,6 +27,7 @@
 #include <trace/events/kvm.h>
 
 #include <kvm/iodev.h>
+#include "ioregion.h"
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
 
@@ -755,6 +756,23 @@ static const struct kvm_io_device_ops ioeventfd_ops = {
 	.destructor = ioeventfd_destructor,
 };
 
+#ifdef CONFIG_KVM_IOREGION
+/* assumes kvm->slots_lock held */
+bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
+			  u64 start, u64 size)
+{
+	struct _ioeventfd *_p;
+
+	list_for_each_entry(_p, &kvm->ioeventfds, list)
+		if (_p->bus_idx == bus_idx &&
+		    overlap(start, size, _p->addr,
+			    !_p->length ? 8 : _p->length))
+			return true;
+
+	return false;
+}
+#endif
+
 /* assumes kvm->slots_lock held */
 static bool
 ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
@@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
 		       _p->datamatch == p->datamatch))))
 			return true;
 
+#ifdef CONFIG_KVM_IOREGION
+	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
+		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
+					  !p->length ? 8 : p->length))
+			return true;
+#endif
+
 	return false;
 }
 
diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
new file mode 100644
index 000000000000..73a621eebae3
--- /dev/null
+++ b/virt/kvm/eventfd.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_EVENTFD_H__
+#define __KVM_EVENTFD_H__
+
+#ifdef CONFIG_KVM_IOREGION
+bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
+#else
+static inline bool
+kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
+{
+	return false;
+}
+#endif
+#endif
diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
new file mode 100644
index 000000000000..a200c3761343
--- /dev/null
+++ b/virt/kvm/ioregion.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <kvm/iodev.h>
+#include "eventfd.h"
+
+void
+kvm_ioregionfd_init(struct kvm *kvm)
+{
+	INIT_LIST_HEAD(&kvm->ioregions_mmio);
+	INIT_LIST_HEAD(&kvm->ioregions_pio);
+}
+
+struct ioregion {
+	struct list_head     list;
+	u64                  paddr;
+	u64                  size;
+	struct file         *rf;
+	struct file         *wf;
+	u64                  user_data;
+	struct kvm_io_device dev;
+	bool                 posted_writes;
+};
+
+static inline struct ioregion *
+to_ioregion(struct kvm_io_device *dev)
+{
+	return container_of(dev, struct ioregion, dev);
+}
+
+/* assumes kvm->slots_lock held */
+static void
+ioregion_release(struct ioregion *p)
+{
+	fput(p->rf);
+	fput(p->wf);
+	list_del(&p->list);
+	kfree(p);
+}
+
+static int
+ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
+	      int len, void *val)
+{
+	return 0;
+}
+
+static int
+ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
+		int len, const void *val)
+{
+	return 0;
+}
+
+/*
+ * This function is called as KVM is completely shutting down.  We do not
+ * need to worry about locking just nuke anything we have as quickly as possible
+ */
+static void
+ioregion_destructor(struct kvm_io_device *this)
+{
+	struct ioregion *p = to_ioregion(this);
+
+	ioregion_release(p);
+}
+
+static const struct kvm_io_device_ops ioregion_ops = {
+	.read       = ioregion_read,
+	.write      = ioregion_write,
+	.destructor = ioregion_destructor,
+};
+
+static inline struct list_head *
+get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
+{
+	return (bus_idx == KVM_MMIO_BUS) ?
+		&kvm->ioregions_mmio : &kvm->ioregions_pio;
+}
+
+/* check for not overlapping case and reverse */
+inline bool
+overlap(u64 start1, u64 size1, u64 start2, u64 size2)
+{
+	u64 end1 = start1 + size1 - 1;
+	u64 end2 = start2 + size2 - 1;
+
+	return !(end1 < start2 || start1 >= end2);
+}
+
+/* assumes kvm->slots_lock held */
+bool
+kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
+		      u64 start, u64 size)
+{
+	struct ioregion *_p;
+	struct list_head *ioregions;
+
+	ioregions = get_ioregion_list(kvm, bus_idx);
+	list_for_each_entry(_p, ioregions, list)
+		if (overlap(start, size, _p->paddr, _p->size))
+			return true;
+
+	return false;
+}
+
+/* assumes kvm->slots_lock held */
+static bool
+ioregion_collision(struct kvm *kvm, struct ioregion *p, enum kvm_bus bus_idx)
+{
+	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
+	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
+		return true;
+
+	return false;
+}
+
+static enum kvm_bus
+get_bus_from_flags(__u32 flags)
+{
+	if (flags & KVM_IOREGION_PIO)
+		return KVM_PIO_BUS;
+	return KVM_MMIO_BUS;
+}
+
+int
+kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	struct ioregion *p;
+	bool is_posted_writes;
+	struct file *rfile, *wfile;
+	enum kvm_bus bus_idx;
+	int ret = 0;
+
+	if (!args->memory_size)
+		return -EINVAL;
+	if ((args->guest_paddr + args->memory_size - 1) < args->guest_paddr)
+		return -EINVAL;
+	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
+		return -EINVAL;
+
+	rfile = fget(args->rfd);
+	if (!rfile)
+		return -EBADF;
+	wfile = fget(args->wfd);
+	if (!wfile) {
+		fput(rfile);
+		return -EBADF;
+	}
+	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags & O_NONBLOCK)) {
+		ret = -EINVAL;
+		goto fail;
+	}
+	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
+	if (!p) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&p->list);
+	p->paddr = args->guest_paddr;
+	p->size = args->memory_size;
+	p->user_data = args->user_data;
+	p->rf = rfile;
+	p->wf = wfile;
+	is_posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
+	p->posted_writes = is_posted_writes ? true : false;
+	bus_idx = get_bus_from_flags(args->flags);
+
+	mutex_lock(&kvm->slots_lock);
+
+	if (ioregion_collision(kvm, p, bus_idx)) {
+		ret = -EEXIST;
+		goto unlock_fail;
+	}
+	kvm_iodevice_init(&p->dev, &ioregion_ops);
+	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
+				      &p->dev);
+	if (ret < 0)
+		goto unlock_fail;
+	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return 0;
+
+unlock_fail:
+	mutex_unlock(&kvm->slots_lock);
+	kfree(p);
+fail:
+	fput(rfile);
+	fput(wfile);
+
+	return ret;
+}
+
+static int
+kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	struct ioregion         *p, *tmp;
+	enum kvm_bus             bus_idx;
+	int                      ret = -ENOENT;
+	struct list_head        *ioregions;
+
+	if (args->rfd != -1 || args->wfd != -1)
+		return -EINVAL;
+
+	bus_idx = get_bus_from_flags(args->flags);
+	ioregions = get_ioregion_list(kvm, bus_idx);
+
+	mutex_lock(&kvm->slots_lock);
+
+	list_for_each_entry_safe(p, tmp, ioregions, list) {
+		if (p->paddr == args->guest_paddr  &&
+		    p->size == args->memory_size) {
+			kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+			ioregion_release(p);
+			ret = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+
+int
+kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	if (args->rfd == -1 || args->wfd == -1)
+		return kvm_rm_ioregion(kvm, args);
+	return kvm_set_ioregion(kvm, args);
+}
diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
new file mode 100644
index 000000000000..23ffa812ec7a
--- /dev/null
+++ b/virt/kvm/ioregion.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_IOREGION_H__
+#define __KVM_IOREGION_H__
+
+#ifdef CONFIG_KVM_IOREGION
+inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
+bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
+#else
+static inline bool
+kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
+{
+	return false;
+}
+#endif
+#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17ff1c4..385d8ec6350d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mmgrab(current->mm);
 	kvm->mm = current->mm;
 	kvm_eventfd_init(kvm);
+	kvm_ioregionfd_init(kvm);
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
@@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
 		break;
 	}
+	case KVM_SET_IOREGION: {
+		struct kvm_ioregion data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof(data)))
+			goto out;
+
+		r = kvm_ioregionfd(kvm, &data);
+		break;
+	}
 	case KVM_GET_DIRTY_LOG: {
 		struct kvm_dirty_log log;
 
@@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	if (!bus)
 		return -ENOMEM;
 
-	/* exclude ioeventfd which is limited by maximum fd */
-	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
-		return -ENOSPC;
+	/* enforce hard limit if kmemcg is disabled and
+	 * exclude ioeventfd which is limited by maximum fd
+	 */
+	if (!memcg_kmem_enabled())
+		if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
+			return -ENOSPC;
 
 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
 			  GFP_KERNEL_ACCOUNT);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2020-12-29 10:02 [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
  2020-12-29 10:02 ` [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
@ 2020-12-29 10:02 ` Elena Afanasova
  2020-12-29 12:00   ` Stefan Hajnoczi
  2020-12-31  3:46   ` Jason Wang
  2020-12-29 12:06 ` [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
  2 siblings, 2 replies; 28+ messages in thread
From: Elena Afanasova @ 2020-12-29 10:02 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
 virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
index a200c3761343..8523f4126337 100644
--- a/virt/kvm/ioregion.c
+++ b/virt/kvm/ioregion.c
@@ -4,6 +4,33 @@
 #include <kvm/iodev.h>
 #include "eventfd.h"
 
+/* Wire protocol */
+struct ioregionfd_cmd {
+	__u32 info;
+	__u32 padding;
+	__u64 user_data;
+	__u64 offset;
+	__u64 data;
+};
+
+struct ioregionfd_resp {
+	__u64 data;
+	__u8 pad[24];
+};
+
+#define IOREGIONFD_CMD_READ    0
+#define IOREGIONFD_CMD_WRITE   1
+
+#define IOREGIONFD_SIZE_8BIT   0
+#define IOREGIONFD_SIZE_16BIT  1
+#define IOREGIONFD_SIZE_32BIT  2
+#define IOREGIONFD_SIZE_64BIT  3
+
+#define IOREGIONFD_SIZE_OFFSET 4
+#define IOREGIONFD_RESP_OFFSET 6
+#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
+#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)
+
 void
 kvm_ioregionfd_init(struct kvm *kvm)
 {
@@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p)
 	kfree(p);
 }
 
+static bool
+pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, bool resp,
+	 u64 user_data, const void *val)
+{
+	u64 size = 0;
+
+	switch (len) {
+	case 1:
+		size = IOREGIONFD_SIZE_8BIT;
+		*((u8 *)&cmd->data) = val ? *(u8 *)val : 0;
+		break;
+	case 2:
+		size = IOREGIONFD_SIZE_16BIT;
+		*((u16 *)&cmd->data) = val ? *(u16 *)val : 0;
+		break;
+	case 4:
+		size = IOREGIONFD_SIZE_32BIT;
+		*((u32 *)&cmd->data) = val ? *(u32 *)val : 0;
+		break;
+	case 8:
+		size = IOREGIONFD_SIZE_64BIT;
+		*((u64 *)&cmd->data) = val ? *(u64 *)val : 0;
+		break;
+	default:
+		return false;
+	}
+	cmd->user_data = user_data;
+	cmd->offset = offset;
+	cmd->info |= opt;
+	cmd->info |= IOREGIONFD_SIZE(size);
+	cmd->info |= IOREGIONFD_RESP(resp);
+
+	return true;
+}
+
 static int
 ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	      int len, void *val)
 {
+	struct ioregion *p = to_ioregion(this);
+	struct ioregionfd_cmd *cmd;
+	struct ioregionfd_resp *resp;
+	size_t buf_size;
+	void *buf;
+	int ret = 0;
+
+	if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK))
+		return -EINVAL;
+	if ((addr + len - 1) > (p->paddr + p->size - 1))
+		return -EINVAL;
+
+	buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp));
+	buf = kzalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	cmd = (struct ioregionfd_cmd *)buf;
+	resp = (struct ioregionfd_resp *)buf;
+	if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
+		      1, p->user_data, NULL)) {
+		kfree(buf);
+		return -EOPNOTSUPP;
+	}
+
+	ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0);
+	if (ret != sizeof(*cmd)) {
+		kfree(buf);
+		return (ret < 0) ? ret : -EIO;
+	}
+	memset(buf, 0, buf_size);
+	ret = kernel_read(p->rf, resp, sizeof(*resp), 0);
+	if (ret != sizeof(*resp)) {
+		kfree(buf);
+		return (ret < 0) ? ret : -EIO;
+	}
+
+	switch (len) {
+	case 1:
+		*(u8 *)val = (u8)resp->data;
+		break;
+	case 2:
+		*(u16 *)val = (u16)resp->data;
+		break;
+	case 4:
+		*(u32 *)val = (u32)resp->data;
+		break;
+	case 8:
+		*(u64 *)val = (u64)resp->data;
+		break;
+	default:
+		break;
+	}
+
+	kfree(buf);
+
 	return 0;
 }
 
@@ -49,6 +166,46 @@ static int
 ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		int len, const void *val)
 {
+	struct ioregion *p = to_ioregion(this);
+	struct ioregionfd_cmd *cmd;
+	struct ioregionfd_resp *resp;
+	size_t buf_size = 0;
+	void *buf;
+	int ret = 0;
+
+	if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK))
+		return -EINVAL;
+	if ((addr + len - 1) > (p->paddr + p->size - 1))
+		return -EINVAL;
+
+	buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp));
+	buf = kzalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	cmd = (struct ioregionfd_cmd *)buf;
+	if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
+		      p->posted_writes ? 0 : 1, p->user_data, val)) {
+		kfree(buf);
+		return -EOPNOTSUPP;
+	}
+
+	ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0);
+	if (ret != sizeof(*cmd)) {
+		kfree(buf);
+		return (ret < 0) ? ret : -EIO;
+	}
+
+	if (!p->posted_writes) {
+		memset(buf, 0, buf_size);
+		resp = (struct ioregionfd_resp *)buf;
+		ret = kernel_read(p->rf, resp, sizeof(*resp), 0);
+		if (ret != sizeof(*resp)) {
+			kfree(buf);
+			return (ret < 0) ? ret : -EIO;
+		}
+	}
+	kfree(buf);
+
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2020-12-29 10:02 ` [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
@ 2020-12-29 11:36   ` Stefan Hajnoczi
  2020-12-30 12:14     ` Elena Afanasova
  2020-12-31  3:45   ` Jason Wang
  1 sibling, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2020-12-29 11:36 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 16894 bytes --]

On Tue, Dec 29, 2020 at 01:02:43PM +0300, Elena Afanasova wrote:
> This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> read and write accesses are dispatched through the given ioregionfd
> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> setting fds to -1.
> 
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
>  arch/x86/kvm/Kconfig     |   1 +
>  arch/x86/kvm/Makefile    |   1 +
>  arch/x86/kvm/x86.c       |   1 +
>  include/linux/kvm_host.h |  17 +++
>  include/uapi/linux/kvm.h |  23 ++++
>  virt/kvm/Kconfig         |   3 +
>  virt/kvm/eventfd.c       |  25 +++++
>  virt/kvm/eventfd.h       |  14 +++
>  virt/kvm/ioregion.c      | 233 +++++++++++++++++++++++++++++++++++++++
>  virt/kvm/ioregion.h      |  15 +++
>  virt/kvm/kvm_main.c      |  20 +++-
>  11 files changed, 350 insertions(+), 3 deletions(-)
>  create mode 100644 virt/kvm/eventfd.h
>  create mode 100644 virt/kvm/ioregion.c
>  create mode 100644 virt/kvm/ioregion.h
> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index f92dfd8ef10d..b914ef375199 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -33,6 +33,7 @@ config KVM
>  	select HAVE_KVM_IRQ_BYPASS
>  	select HAVE_KVM_IRQ_ROUTING
>  	select HAVE_KVM_EVENTFD
> +	select KVM_IOREGION
>  	select KVM_ASYNC_PF
>  	select USER_RETURN_NOTIFIER
>  	select KVM_MMIO

TODO non-x86 arch support

> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index b804444e16d4..b3b17dc9f7d4 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>  kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
>  				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>  
>  kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
>  			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e545a8a613b1..ddb28f5ca252 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_X86_USER_SPACE_MSR:
>  	case KVM_CAP_X86_MSR_FILTER:
>  	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> +	case KVM_CAP_IOREGIONFD:
>  		r = 1;
>  		break;
>  	case KVM_CAP_SYNC_REGS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7f2e2a09ebbd..7cd667dddba9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -470,6 +470,10 @@ struct kvm {
>  		struct mutex      resampler_lock;
>  	} irqfds;
>  	struct list_head ioeventfds;
> +#endif
> +#ifdef CONFIG_KVM_IOREGION
> +	struct list_head ioregions_mmio;
> +	struct list_head ioregions_pio;
>  #endif
>  	struct kvm_vm_stat stat;
>  	struct kvm_arch arch;
> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  
>  #endif /* CONFIG_HAVE_KVM_EVENTFD */
>  
> +#ifdef CONFIG_KVM_IOREGION
> +void kvm_ioregionfd_init(struct kvm *kvm);
> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> +
> +#else
> +
> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> +static inline int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	return -ENOSYS;
> +}
> +#endif
> +
>  void kvm_arch_irq_routing_update(struct kvm *kvm);
>  
>  static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ca41220b40b8..81e775778c66 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>  	__u8  pad[36];
>  };
>  
> +enum {
> +	kvm_ioregion_flag_nr_pio,
> +	kvm_ioregion_flag_nr_posted_writes,
> +	kvm_ioregion_flag_nr_max,
> +};
> +
> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> +#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
> +
> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
> +
> +struct kvm_ioregion {
> +	__u64 guest_paddr; /* guest physical address */
> +	__u64 memory_size; /* bytes */
> +	__u64 user_data;
> +	__s32 rfd;
> +	__s32 wfd;
> +	__u32 flags;
> +	__u8  pad[28];
> +};
> +
>  #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>  #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>  #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_X86_USER_SPACE_MSR 188
>  #define KVM_CAP_X86_MSR_FILTER 189
>  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> +#define KVM_CAP_IOREGIONFD 191
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>  					struct kvm_userspace_memory_region)
>  #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct kvm_ioregion)
>  
>  /* enable ucontrol for s390 */
>  struct kvm_s390_ucas_mapping {
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 1c37ccd5d402..5e6620bbf000 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
>         bool
>         select EVENTFD
>  
> +config KVM_IOREGION
> +       bool
> +
>  config KVM_MMIO
>         bool
>  
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index c2323c27a28b..aadb73903f8b 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -27,6 +27,7 @@
>  #include <trace/events/kvm.h>
>  
>  #include <kvm/iodev.h>
> +#include "ioregion.h"
>  
>  #ifdef CONFIG_HAVE_KVM_IRQFD
>  
> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops ioeventfd_ops = {
>  	.destructor = ioeventfd_destructor,
>  };
>  
> +#ifdef CONFIG_KVM_IOREGION
> +/* assumes kvm->slots_lock held */
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> +			  u64 start, u64 size)
> +{
> +	struct _ioeventfd *_p;
> +
> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> +		if (_p->bus_idx == bus_idx &&
> +		    overlap(start, size, _p->addr,
> +			    !_p->length ? 8 : _p->length))
> +			return true;
> +
> +	return false;
> +}
> +#endif
> +
>  /* assumes kvm->slots_lock held */
>  static bool
>  ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>  		       _p->datamatch == p->datamatch))))
>  			return true;
>  
> +#ifdef CONFIG_KVM_IOREGION
> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> +					  !p->length ? 8 : p->length))
> +			return true;
> +#endif
> +
>  	return false;
>  }
>  
> diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> new file mode 100644
> index 000000000000..73a621eebae3
> --- /dev/null
> +++ b/virt/kvm/eventfd.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __KVM_EVENTFD_H__
> +#define __KVM_EVENTFD_H__
> +
> +#ifdef CONFIG_KVM_IOREGION
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
> +#else
> +static inline bool
> +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
> +{
> +	return false;
> +}
> +#endif
> +#endif
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> new file mode 100644
> index 000000000000..a200c3761343
> --- /dev/null
> +++ b/virt/kvm/ioregion.c
> @@ -0,0 +1,233 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kvm_host.h>
> +#include <linux/fs.h>
> +#include <kvm/iodev.h>
> +#include "eventfd.h"
> +
> +void
> +kvm_ioregionfd_init(struct kvm *kvm)
> +{
> +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> +}
> +
> +struct ioregion {

Please add comments describing the purpose of the fields, locking, etc.
For example, the list field is used with kvm->ioregions_mmio/pio. paddr
is a guest physical address. size is in bytes. wf is for writing struct
ioregion_cmd. rf is for reading struct ioregion_resp.

> +	struct list_head     list;
> +	u64                  paddr;
> +	u64                  size;
> +	struct file         *rf;
> +	struct file         *wf;
> +	u64                  user_data;
> +	struct kvm_io_device dev;
> +	bool                 posted_writes;

TODO implement posted_writes

> +};
> +
> +static inline struct ioregion *
> +to_ioregion(struct kvm_io_device *dev)
> +{
> +	return container_of(dev, struct ioregion, dev);
> +}
> +
> +/* assumes kvm->slots_lock held */
> +static void
> +ioregion_release(struct ioregion *p)
> +{
> +	fput(p->rf);
> +	fput(p->wf);
> +	list_del(&p->list);
> +	kfree(p);
> +}
> +
> +static int
> +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> +	      int len, void *val)
> +{
> +	return 0;
> +}
> +
> +static int
> +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> +		int len, const void *val)
> +{
> +	return 0;
> +}

The unimplemented ->read()/->write() should probably return errors.

> +
> +/*
> + * This function is called as KVM is completely shutting down.  We do not
> + * need to worry about locking just nuke anything we have as quickly as possible
> + */
> +static void
> +ioregion_destructor(struct kvm_io_device *this)
> +{
> +	struct ioregion *p = to_ioregion(this);
> +
> +	ioregion_release(p);
> +}
> +
> +static const struct kvm_io_device_ops ioregion_ops = {
> +	.read       = ioregion_read,
> +	.write      = ioregion_write,
> +	.destructor = ioregion_destructor,
> +};
> +
> +static inline struct list_head *
> +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> +{
> +	return (bus_idx == KVM_MMIO_BUS) ?
> +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> +}
> +
> +/* check for not overlapping case and reverse */
> +inline bool
> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> +{
> +	u64 end1 = start1 + size1 - 1;
> +	u64 end2 = start2 + size2 - 1;
> +
> +	return !(end1 < start2 || start1 >= end2);
> +}
> +
> +/* assumes kvm->slots_lock held */
> +bool
> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> +		      u64 start, u64 size)
> +{
> +	struct ioregion *_p;
> +	struct list_head *ioregions;
> +
> +	ioregions = get_ioregion_list(kvm, bus_idx);
> +	list_for_each_entry(_p, ioregions, list)
> +		if (overlap(start, size, _p->paddr, _p->size))
> +			return true;
> +
> +	return false;
> +}
> +
> +/* assumes kvm->slots_lock held */
> +static bool
> +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum kvm_bus bus_idx)
> +{
> +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> +		return true;
> +
> +	return false;
> +}
> +
> +static enum kvm_bus
> +get_bus_from_flags(__u32 flags)
> +{
> +	if (flags & KVM_IOREGION_PIO)
> +		return KVM_PIO_BUS;
> +	return KVM_MMIO_BUS;
> +}
> +
> +int
> +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	struct ioregion *p;
> +	bool is_posted_writes;
> +	struct file *rfile, *wfile;
> +	enum kvm_bus bus_idx;
> +	int ret = 0;
> +
> +	if (!args->memory_size)
> +		return -EINVAL;
> +	if ((args->guest_paddr + args->memory_size - 1) < args->guest_paddr)
> +		return -EINVAL;
> +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> +		return -EINVAL;
> +
> +	rfile = fget(args->rfd);
> +	if (!rfile)
> +		return -EBADF;
> +	wfile = fget(args->wfd);
> +	if (!wfile) {
> +		fput(rfile);
> +		return -EBADF;
> +	}
> +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags & O_NONBLOCK)) {

This check prevents most user errors, but the userspace process can
still change the file descriptor flags later. Therefore the code needs
to be written to fail cleanly on -EAGAIN/-EWOULDBLOCK (no infinite loops
or crashes). It's worth noting this in a comment here so others reading
the code are aware of this constraint.

> +		ret = -EINVAL;
> +		goto fail;
> +	}
> +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> +	if (!p) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	INIT_LIST_HEAD(&p->list);
> +	p->paddr = args->guest_paddr;
> +	p->size = args->memory_size;
> +	p->user_data = args->user_data;
> +	p->rf = rfile;
> +	p->wf = wfile;
> +	is_posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
> +	p->posted_writes = is_posted_writes ? true : false;
> +	bus_idx = get_bus_from_flags(args->flags);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	if (ioregion_collision(kvm, p, bus_idx)) {
> +		ret = -EEXIST;
> +		goto unlock_fail;
> +	}
> +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
> +				      &p->dev);
> +	if (ret < 0)
> +		goto unlock_fail;
> +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return 0;
> +
> +unlock_fail:
> +	mutex_unlock(&kvm->slots_lock);
> +	kfree(p);
> +fail:
> +	fput(rfile);
> +	fput(wfile);
> +
> +	return ret;
> +}
> +
> +static int
> +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	struct ioregion         *p, *tmp;
> +	enum kvm_bus             bus_idx;
> +	int                      ret = -ENOENT;
> +	struct list_head        *ioregions;
> +
> +	if (args->rfd != -1 || args->wfd != -1)
> +		return -EINVAL;
> +
> +	bus_idx = get_bus_from_flags(args->flags);
> +	ioregions = get_ioregion_list(kvm, bus_idx);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> +		if (p->paddr == args->guest_paddr  &&
> +		    p->size == args->memory_size) {
> +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
> +			ioregion_release(p);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return ret;
> +}
> +
> +int
> +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> +{

The following check can be done here to make sure the ioctl always
returns EINVAL if unsupported flags are passed:

  if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
  	return -EINVAL;

(It's currently missing in kvm_rm_ioregion().)

> +	if (args->rfd == -1 || args->wfd == -1)
> +		return kvm_rm_ioregion(kvm, args);
> +	return kvm_set_ioregion(kvm, args);
> +}
> diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> new file mode 100644
> index 000000000000..23ffa812ec7a
> --- /dev/null
> +++ b/virt/kvm/ioregion.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __KVM_IOREGION_H__
> +#define __KVM_IOREGION_H__
> +
> +#ifdef CONFIG_KVM_IOREGION
> +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
> +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
> +#else
> +static inline bool
> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
> +{
> +	return false;
> +}
> +#endif
> +#endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2541a17ff1c4..385d8ec6350d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
>  	mmgrab(current->mm);
>  	kvm->mm = current->mm;
>  	kvm_eventfd_init(kvm);
> +	kvm_ioregionfd_init(kvm);
>  	mutex_init(&kvm->lock);
>  	mutex_init(&kvm->irq_lock);
>  	mutex_init(&kvm->slots_lock);
> @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
>  		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
>  		break;
>  	}
> +	case KVM_SET_IOREGION: {
> +		struct kvm_ioregion data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&data, argp, sizeof(data)))
> +			goto out;
> +
> +		r = kvm_ioregionfd(kvm, &data);
> +		break;
> +	}
>  	case KVM_GET_DIRTY_LOG: {
>  		struct kvm_dirty_log log;
>  
> @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  	if (!bus)
>  		return -ENOMEM;
>  
> -	/* exclude ioeventfd which is limited by maximum fd */
> -	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> -		return -ENOSPC;
> +	/* enforce hard limit if kmemcg is disabled and
> +	 * exclude ioeventfd which is limited by maximum fd
> +	 */
> +	if (!memcg_kmem_enabled())
> +		if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> +			return -ENOSPC;
>  
>  	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
>  			  GFP_KERNEL_ACCOUNT);

Please move this change to a separate patch.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2020-12-29 10:02 ` [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations Elena Afanasova
@ 2020-12-29 12:00   ` Stefan Hajnoczi
  2020-12-30 12:24     ` Elena Afanasova
  2020-12-31  3:46   ` Jason Wang
  1 sibling, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2020-12-29 12:00 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 4714 bytes --]

On Tue, Dec 29, 2020 at 01:02:44PM +0300, Elena Afanasova wrote:
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
>  virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 157 insertions(+)
> 
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index a200c3761343..8523f4126337 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -4,6 +4,33 @@
>  #include <kvm/iodev.h>
>  #include "eventfd.h"
>  
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};
> +
> +struct ioregionfd_resp {
> +	__u64 data;
> +	__u8 pad[24];
> +};
> +
> +#define IOREGIONFD_CMD_READ    0
> +#define IOREGIONFD_CMD_WRITE   1
> +
> +#define IOREGIONFD_SIZE_8BIT   0
> +#define IOREGIONFD_SIZE_16BIT  1
> +#define IOREGIONFD_SIZE_32BIT  2
> +#define IOREGIONFD_SIZE_64BIT  3
> +
> +#define IOREGIONFD_SIZE_OFFSET 4
> +#define IOREGIONFD_RESP_OFFSET 6
> +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)

These belong in the uapi header so userspace also has struct
ioregionfd_cmd, struct ioregionfd_resp, etc.

> +
>  void
>  kvm_ioregionfd_init(struct kvm *kvm)
>  {
> @@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p)
>  	kfree(p);
>  }
>  
> +static bool
> +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, bool resp,
> +	 u64 user_data, const void *val)
> +{
> +	u64 size = 0;
> +
> +	switch (len) {
> +	case 1:
> +		size = IOREGIONFD_SIZE_8BIT;
> +		*((u8 *)&cmd->data) = val ? *(u8 *)val : 0;
> +		break;
> +	case 2:
> +		size = IOREGIONFD_SIZE_16BIT;
> +		*((u16 *)&cmd->data) = val ? *(u16 *)val : 0;
> +		break;
> +	case 4:
> +		size = IOREGIONFD_SIZE_32BIT;
> +		*((u32 *)&cmd->data) = val ? *(u32 *)val : 0;
> +		break;
> +	case 8:
> +		size = IOREGIONFD_SIZE_64BIT;
> +		*((u64 *)&cmd->data) = val ? *(u64 *)val : 0;
> +		break;
> +	default:
> +		return false;
>

The assignments and casts can be replaced with a single memcpy after the
switch statement. This is also how KVM_EXIT_MMIO and Coalesced MMIO do
it:

  memcpy(cmd->data, val, len);

However, we need to make sure that cmd has been zeroed so that kernel
memory is not accidentally exposed to userspace.

> +	}
> +	cmd->user_data = user_data;
> +	cmd->offset = offset;
> +	cmd->info |= opt;
> +	cmd->info |= IOREGIONFD_SIZE(size);
> +	cmd->info |= IOREGIONFD_RESP(resp);
> +
> +	return true;
> +}
> +
>  static int
>  ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
>  	      int len, void *val)
>  {
> +	struct ioregion *p = to_ioregion(this);
> +	struct ioregionfd_cmd *cmd;
> +	struct ioregionfd_resp *resp;
> +	size_t buf_size;
> +	void *buf;
> +	int ret = 0;
> +
> +	if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK))
> +		return -EINVAL;

Another CPU could change file descriptor flags while we are running.
Therefore it might be simplest to handle kernel_write() and
kernel_read() -EAGAIN return values instead of trying to check.

> +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> +		return -EINVAL;
> +
> +	buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp));
> +	buf = kzalloc(buf_size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +	cmd = (struct ioregionfd_cmd *)buf;
> +	resp = (struct ioregionfd_resp *)buf;

I think they are small enough to declare them on the stack:

  union {
      struct ioregionfd_cmd cmd;
      struct ioregionfd_resp resp;
  } buf;

  memset(&buf, 0, sizeof(buf));

> +	if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
> +		      1, p->user_data, NULL)) {
> +		kfree(buf);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0);
> +	if (ret != sizeof(*cmd)) {
> +		kfree(buf);
> +		return (ret < 0) ? ret : -EIO;
> +	}
> +	memset(buf, 0, buf_size);
> +	ret = kernel_read(p->rf, resp, sizeof(*resp), 0);
> +	if (ret != sizeof(*resp)) {
> +		kfree(buf);
> +		return (ret < 0) ? ret : -EIO;
> +	}
> +
> +	switch (len) {
> +	case 1:
> +		*(u8 *)val = (u8)resp->data;
> +		break;
> +	case 2:
> +		*(u16 *)val = (u16)resp->data;
> +		break;
> +	case 4:
> +		*(u32 *)val = (u32)resp->data;
> +		break;
> +	case 8:
> +		*(u64 *)val = (u64)resp->data;
> +		break;
> +	default:
> +		break;
> +	}

This looks inconsistent. cmd->data is treated as a packed u8/u16/u32/u64
whereas resp->data is treated as u64?

I was expecting memcpy(val, &resp->data, len) here not the u64 ->
u8/u16/u32/u64 conversion.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
  2020-12-29 10:02 [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
  2020-12-29 10:02 ` [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
  2020-12-29 10:02 ` [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations Elena Afanasova
@ 2020-12-29 12:06 ` Stefan Hajnoczi
  2020-12-30 17:56   ` Elena Afanasova
  2 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2020-12-29 12:06 UTC (permalink / raw)
  To: Elena Afanasova
  Cc: kvm, jag.raman, elena.ufimtseva, Michael S. Tsirkin, jasowang

[-- Attachment #1: Type: text/plain, Size: 1723 bytes --]

On Tue, Dec 29, 2020 at 01:02:42PM +0300, Elena Afanasova wrote:
> This patchset introduces a KVM dispatch mechanism which can be used 
> for handling MMIO/PIO accesses over file descriptors without returning 
> from ioctl(KVM_RUN). This allows device emulation to run in another task 
> separate from the vCPU task.
> 
> This is achieved through KVM vm ioctl for registering MMIO/PIO regions and 
> a wire protocol that KVM uses to communicate with a task handling an 
> MMIO/PIO access.
> 
> ioregionfd relies on kmemcg in order to limit the amount of kernel memory 
> that userspace can consume. Can NR_IOBUS_DEVS hardcoded limit be enforced 
> only in case kmemcg is disabled?

Thanks for sharing this! Can you describe the todos? I noticed some in
Patch 1 and highlighted them. In addition:
 * Signal handling when the vCPU thread is interrupted in
   kernel_read()/kernel_write()

> Elena Afanasova (2):
>   KVM: add initial support for KVM_SET_IOREGION
>   KVM: add initial support for ioregionfd blocking read/write operations
> 
>  arch/x86/kvm/Kconfig     |   1 +
>  arch/x86/kvm/Makefile    |   1 +
>  arch/x86/kvm/x86.c       |   1 +
>  include/linux/kvm_host.h |  17 ++
>  include/uapi/linux/kvm.h |  23 +++
>  virt/kvm/Kconfig         |   3 +
>  virt/kvm/eventfd.c       |  25 +++
>  virt/kvm/eventfd.h       |  14 ++
>  virt/kvm/ioregion.c      | 390 +++++++++++++++++++++++++++++++++++++++
>  virt/kvm/ioregion.h      |  15 ++
>  virt/kvm/kvm_main.c      |  20 +-
>  11 files changed, 507 insertions(+), 3 deletions(-)
>  create mode 100644 virt/kvm/eventfd.h
>  create mode 100644 virt/kvm/ioregion.c
>  create mode 100644 virt/kvm/ioregion.h
> 
> -- 
> 2.25.1
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2020-12-29 11:36   ` Stefan Hajnoczi
@ 2020-12-30 12:14     ` Elena Afanasova
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2020-12-30 12:14 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kvm, jag.raman, elena.ufimtseva

On Tue, 2020-12-29 at 11:36 +0000, Stefan Hajnoczi wrote:
> On Tue, Dec 29, 2020 at 01:02:43PM +0300, Elena Afanasova wrote:
> > This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> > read and write accesses are dispatched through the given ioregionfd
> > instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> > setting fds to -1.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> >  arch/x86/kvm/Kconfig     |   1 +
> >  arch/x86/kvm/Makefile    |   1 +
> >  arch/x86/kvm/x86.c       |   1 +
> >  include/linux/kvm_host.h |  17 +++
> >  include/uapi/linux/kvm.h |  23 ++++
> >  virt/kvm/Kconfig         |   3 +
> >  virt/kvm/eventfd.c       |  25 +++++
> >  virt/kvm/eventfd.h       |  14 +++
> >  virt/kvm/ioregion.c      | 233
> > +++++++++++++++++++++++++++++++++++++++
> >  virt/kvm/ioregion.h      |  15 +++
> >  virt/kvm/kvm_main.c      |  20 +++-
> >  11 files changed, 350 insertions(+), 3 deletions(-)
> >  create mode 100644 virt/kvm/eventfd.h
> >  create mode 100644 virt/kvm/ioregion.c
> >  create mode 100644 virt/kvm/ioregion.h
> > 
> > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > index f92dfd8ef10d..b914ef375199 100644
> > --- a/arch/x86/kvm/Kconfig
> > +++ b/arch/x86/kvm/Kconfig
> > @@ -33,6 +33,7 @@ config KVM
> >  	select HAVE_KVM_IRQ_BYPASS
> >  	select HAVE_KVM_IRQ_ROUTING
> >  	select HAVE_KVM_EVENTFD
> > +	select KVM_IOREGION
> >  	select KVM_ASYNC_PF
> >  	select USER_RETURN_NOTIFIER
> >  	select KVM_MMIO
> 
> TODO non-x86 arch support
> 
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index b804444e16d4..b3b17dc9f7d4 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> >  kvm-y			+= $(KVM)/kvm_main.o
> > $(KVM)/coalesced_mmio.o \
> >  				$(KVM)/eventfd.o $(KVM)/irqchip.o
> > $(KVM)/vfio.o
> >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> >  
> >  kvm-y			+= x86.o emulate.o i8259.o irq.o
> > lapic.o \
> >  			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
> > mtrr.o \
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index e545a8a613b1..ddb28f5ca252 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
> > *kvm, long ext)
> >  	case KVM_CAP_X86_USER_SPACE_MSR:
> >  	case KVM_CAP_X86_MSR_FILTER:
> >  	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > +	case KVM_CAP_IOREGIONFD:
> >  		r = 1;
> >  		break;
> >  	case KVM_CAP_SYNC_REGS:
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7f2e2a09ebbd..7cd667dddba9 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -470,6 +470,10 @@ struct kvm {
> >  		struct mutex      resampler_lock;
> >  	} irqfds;
> >  	struct list_head ioeventfds;
> > +#endif
> > +#ifdef CONFIG_KVM_IOREGION
> > +	struct list_head ioregions_mmio;
> > +	struct list_head ioregions_pio;
> >  #endif
> >  	struct kvm_vm_stat stat;
> >  	struct kvm_arch arch;
> > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm
> > *kvm, struct kvm_ioeventfd *args)
> >  
> >  #endif /* CONFIG_HAVE_KVM_EVENTFD */
> >  
> > +#ifdef CONFIG_KVM_IOREGION
> > +void kvm_ioregionfd_init(struct kvm *kvm);
> > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> > +
> > +#else
> > +
> > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > kvm_ioregion *args)
> > +{
> > +	return -ENOSYS;
> > +}
> > +#endif
> > +
> >  void kvm_arch_irq_routing_update(struct kvm *kvm);
> >  
> >  static inline void kvm_make_request(int req, struct kvm_vcpu
> > *vcpu)
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index ca41220b40b8..81e775778c66 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> >  	__u8  pad[36];
> >  };
> >  
> > +enum {
> > +	kvm_ioregion_flag_nr_pio,
> > +	kvm_ioregion_flag_nr_posted_writes,
> > +	kvm_ioregion_flag_nr_max,
> > +};
> > +
> > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > kvm_ioregion_flag_nr_posted_writes)
> > +
> > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > kvm_ioregion_flag_nr_max) - 1)
> > +
> > +struct kvm_ioregion {
> > +	__u64 guest_paddr; /* guest physical address */
> > +	__u64 memory_size; /* bytes */
> > +	__u64 user_data;
> > +	__s32 rfd;
> > +	__s32 wfd;
> > +	__u32 flags;
> > +	__u8  pad[28];
> > +};
> > +
> >  #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> >  #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> >  #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> >  #define KVM_CAP_X86_USER_SPACE_MSR 188
> >  #define KVM_CAP_X86_MSR_FILTER 189
> >  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > +#define KVM_CAP_IOREGIONFD 191
> >  
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >  
> > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> >  					struct
> > kvm_userspace_memory_region)
> >  #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> >  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > kvm_ioregion)
> >  
> >  /* enable ucontrol for s390 */
> >  struct kvm_s390_ucas_mapping {
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 1c37ccd5d402..5e6620bbf000 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
> >         bool
> >         select EVENTFD
> >  
> > +config KVM_IOREGION
> > +       bool
> > +
> >  config KVM_MMIO
> >         bool
> >  
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index c2323c27a28b..aadb73903f8b 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -27,6 +27,7 @@
> >  #include <trace/events/kvm.h>
> >  
> >  #include <kvm/iodev.h>
> > +#include "ioregion.h"
> >  
> >  #ifdef CONFIG_HAVE_KVM_IRQFD
> >  
> > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > ioeventfd_ops = {
> >  	.destructor = ioeventfd_destructor,
> >  };
> >  
> > +#ifdef CONFIG_KVM_IOREGION
> > +/* assumes kvm->slots_lock held */
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > +			  u64 start, u64 size)
> > +{
> > +	struct _ioeventfd *_p;
> > +
> > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > +		if (_p->bus_idx == bus_idx &&
> > +		    overlap(start, size, _p->addr,
> > +			    !_p->length ? 8 : _p->length))
> > +			return true;
> > +
> > +	return false;
> > +}
> > +#endif
> > +
> >  /* assumes kvm->slots_lock held */
> >  static bool
> >  ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > struct _ioeventfd *p)
> >  		       _p->datamatch == p->datamatch))))
> >  			return true;
> >  
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> > +					  !p->length ? 8 : p->length))
> > +			return true;
> > +#endif
> > +
> >  	return false;
> >  }
> >  
> > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> > new file mode 100644
> > index 000000000000..73a621eebae3
> > --- /dev/null
> > +++ b/virt/kvm/eventfd.h
> > @@ -0,0 +1,14 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __KVM_EVENTFD_H__
> > +#define __KVM_EVENTFD_H__
> > +
> > +#ifdef CONFIG_KVM_IOREGION
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
> > u64 size);
> > +#else
> > +static inline bool
> > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > size)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +#endif
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > new file mode 100644
> > index 000000000000..a200c3761343
> > --- /dev/null
> > +++ b/virt/kvm/ioregion.c
> > @@ -0,0 +1,233 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/kvm_host.h>
> > +#include <linux/fs.h>
> > +#include <kvm/iodev.h>
> > +#include "eventfd.h"
> > +
> > +void
> > +kvm_ioregionfd_init(struct kvm *kvm)
> > +{
> > +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> > +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> > +}
> > +
> > +struct ioregion {
> 
> Please add comments describing the purpose of the fields, locking,
> etc.
> For example, the list field is used with kvm->ioregions_mmio/pio.
> paddr
> is a guest physical address. size is in bytes. wf is for writing
> struct
> ioregion_cmd. rf is for reading struct ioregion_resp.
> 
Ok

> > +	struct list_head     list;
> > +	u64                  paddr;
> > +	u64                  size;
> > +	struct file         *rf;
> > +	struct file         *wf;
> > +	u64                  user_data;
> > +	struct kvm_io_device dev;
> > +	bool                 posted_writes;
> 
> TODO implement posted_writes
> 
> > +};
> > +
> > +static inline struct ioregion *
> > +to_ioregion(struct kvm_io_device *dev)
> > +{
> > +	return container_of(dev, struct ioregion, dev);
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +static void
> > +ioregion_release(struct ioregion *p)
> > +{
> > +	fput(p->rf);
> > +	fput(p->wf);
> > +	list_del(&p->list);
> > +	kfree(p);
> > +}
> > +
> > +static int
> > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> > +	      int len, void *val)
> > +{
> > +	return 0;
> > +}
> > +
> > +static int
> > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> > +		int len, const void *val)
> > +{
> > +	return 0;
> > +}
> 
> The unimplemented ->read()/->write() should probably return errors.
> 
You're right, it would be better

> > +
> > +/*
> > + * This function is called as KVM is completely shutting down.  We
> > do not
> > + * need to worry about locking just nuke anything we have as
> > quickly as possible
> > + */
> > +static void
> > +ioregion_destructor(struct kvm_io_device *this)
> > +{
> > +	struct ioregion *p = to_ioregion(this);
> > +
> > +	ioregion_release(p);
> > +}
> > +
> > +static const struct kvm_io_device_ops ioregion_ops = {
> > +	.read       = ioregion_read,
> > +	.write      = ioregion_write,
> > +	.destructor = ioregion_destructor,
> > +};
> > +
> > +static inline struct list_head *
> > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> > +{
> > +	return (bus_idx == KVM_MMIO_BUS) ?
> > +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> > +}
> > +
> > +/* check for not overlapping case and reverse */
> > +inline bool
> > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > +{
> > +	u64 end1 = start1 + size1 - 1;
> > +	u64 end2 = start2 + size2 - 1;
> > +
> > +	return !(end1 < start2 || start1 >= end2);
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +bool
> > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> > +		      u64 start, u64 size)
> > +{
> > +	struct ioregion *_p;
> > +	struct list_head *ioregions;
> > +
> > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > +	list_for_each_entry(_p, ioregions, list)
> > +		if (overlap(start, size, _p->paddr, _p->size))
> > +			return true;
> > +
> > +	return false;
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +static bool
> > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
> > kvm_bus bus_idx)
> > +{
> > +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> > +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +static enum kvm_bus
> > +get_bus_from_flags(__u32 flags)
> > +{
> > +	if (flags & KVM_IOREGION_PIO)
> > +		return KVM_PIO_BUS;
> > +	return KVM_MMIO_BUS;
> > +}
> > +
> > +int
> > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	struct ioregion *p;
> > +	bool is_posted_writes;
> > +	struct file *rfile, *wfile;
> > +	enum kvm_bus bus_idx;
> > +	int ret = 0;
> > +
> > +	if (!args->memory_size)
> > +		return -EINVAL;
> > +	if ((args->guest_paddr + args->memory_size - 1) < args-
> > >guest_paddr)
> > +		return -EINVAL;
> > +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> > +		return -EINVAL;
> > +
> > +	rfile = fget(args->rfd);
> > +	if (!rfile)
> > +		return -EBADF;
> > +	wfile = fget(args->wfd);
> > +	if (!wfile) {
> > +		fput(rfile);
> > +		return -EBADF;
> > +	}
> > +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags &
> > O_NONBLOCK)) {
> 
> This check prevents most user errors, but the userspace process can
> still change the file descriptor flags later. Therefore the code
> needs
> to be written to fail cleanly on -EAGAIN/-EWOULDBLOCK (no infinite
> loops
> or crashes). It's worth noting this in a comment here so others
> reading
> the code are aware of this constraint.
> 
Ok, I'll fix this, thanks!

> > +		ret = -EINVAL;
> > +		goto fail;
> > +	}
> > +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> > +	if (!p) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> > +	}
> > +
> > +	INIT_LIST_HEAD(&p->list);
> > +	p->paddr = args->guest_paddr;
> > +	p->size = args->memory_size;
> > +	p->user_data = args->user_data;
> > +	p->rf = rfile;
> > +	p->wf = wfile;
> > +	is_posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
> > +	p->posted_writes = is_posted_writes ? true : false;
> > +	bus_idx = get_bus_from_flags(args->flags);
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	if (ioregion_collision(kvm, p, bus_idx)) {
> > +		ret = -EEXIST;
> > +		goto unlock_fail;
> > +	}
> > +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> > +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
> > +				      &p->dev);
> > +	if (ret < 0)
> > +		goto unlock_fail;
> > +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return 0;
> > +
> > +unlock_fail:
> > +	mutex_unlock(&kvm->slots_lock);
> > +	kfree(p);
> > +fail:
> > +	fput(rfile);
> > +	fput(wfile);
> > +
> > +	return ret;
> > +}
> > +
> > +static int
> > +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	struct ioregion         *p, *tmp;
> > +	enum kvm_bus             bus_idx;
> > +	int                      ret = -ENOENT;
> > +	struct list_head        *ioregions;
> > +
> > +	if (args->rfd != -1 || args->wfd != -1)
> > +		return -EINVAL;
> > +
> > +	bus_idx = get_bus_from_flags(args->flags);
> > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> > +		if (p->paddr == args->guest_paddr  &&
> > +		    p->size == args->memory_size) {
> > +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p-
> > >dev);
> > +			ioregion_release(p);
> > +			ret = 0;
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return ret;
> > +}
> > +
> > +int
> > +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> 
> The following check can be done here to make sure the ioctl always
> returns EINVAL if unsupported flags are passed:
> 
>   if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
>   	return -EINVAL;
> 
> (It's currently missing in kvm_rm_ioregion().)
> 
Ok

> > +	if (args->rfd == -1 || args->wfd == -1)
> > +		return kvm_rm_ioregion(kvm, args);
> > +	return kvm_set_ioregion(kvm, args);
> > +}
> > diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> > new file mode 100644
> > index 000000000000..23ffa812ec7a
> > --- /dev/null
> > +++ b/virt/kvm/ioregion.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __KVM_IOREGION_H__
> > +#define __KVM_IOREGION_H__
> > +
> > +#ifdef CONFIG_KVM_IOREGION
> > +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
> > +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64
> > start, u64 size);
> > +#else
> > +static inline bool
> > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > size)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +#endif
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 2541a17ff1c4..385d8ec6350d 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long
> > type)
> >  	mmgrab(current->mm);
> >  	kvm->mm = current->mm;
> >  	kvm_eventfd_init(kvm);
> > +	kvm_ioregionfd_init(kvm);
> >  	mutex_init(&kvm->lock);
> >  	mutex_init(&kvm->irq_lock);
> >  	mutex_init(&kvm->slots_lock);
> > @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
> >  		r = kvm_vm_ioctl_set_memory_region(kvm,
> > &kvm_userspace_mem);
> >  		break;
> >  	}
> > +	case KVM_SET_IOREGION: {
> > +		struct kvm_ioregion data;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&data, argp, sizeof(data)))
> > +			goto out;
> > +
> > +		r = kvm_ioregionfd(kvm, &data);
> > +		break;
> > +	}
> >  	case KVM_GET_DIRTY_LOG: {
> >  		struct kvm_dirty_log log;
> >  
> > @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm,
> > enum kvm_bus bus_idx, gpa_t addr,
> >  	if (!bus)
> >  		return -ENOMEM;
> >  
> > -	/* exclude ioeventfd which is limited by maximum fd */
> > -	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> > -		return -ENOSPC;
> > +	/* enforce hard limit if kmemcg is disabled and
> > +	 * exclude ioeventfd which is limited by maximum fd
> > +	 */
> > +	if (!memcg_kmem_enabled())
> > +		if (bus->dev_count - bus->ioeventfd_count >
> > NR_IOBUS_DEVS - 1)
> > +			return -ENOSPC;
> >  
> >  	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
> >  			  GFP_KERNEL_ACCOUNT);
> 
> Please move this change to a separate patch.
Ok


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2020-12-29 12:00   ` Stefan Hajnoczi
@ 2020-12-30 12:24     ` Elena Afanasova
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2020-12-30 12:24 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kvm, jag.raman, elena.ufimtseva

On Tue, 2020-12-29 at 12:00 +0000, Stefan Hajnoczi wrote:
> On Tue, Dec 29, 2020 at 01:02:44PM +0300, Elena Afanasova wrote:
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> >  virt/kvm/ioregion.c | 157
> > ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 157 insertions(+)
> > 
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > index a200c3761343..8523f4126337 100644
> > --- a/virt/kvm/ioregion.c
> > +++ b/virt/kvm/ioregion.c
> > @@ -4,6 +4,33 @@
> >  #include <kvm/iodev.h>
> >  #include "eventfd.h"
> >  
> > +/* Wire protocol */
> > +struct ioregionfd_cmd {
> > +	__u32 info;
> > +	__u32 padding;
> > +	__u64 user_data;
> > +	__u64 offset;
> > +	__u64 data;
> > +};
> > +
> > +struct ioregionfd_resp {
> > +	__u64 data;
> > +	__u8 pad[24];
> > +};
> > +
> > +#define IOREGIONFD_CMD_READ    0
> > +#define IOREGIONFD_CMD_WRITE   1
> > +
> > +#define IOREGIONFD_SIZE_8BIT   0
> > +#define IOREGIONFD_SIZE_16BIT  1
> > +#define IOREGIONFD_SIZE_32BIT  2
> > +#define IOREGIONFD_SIZE_64BIT  3
> > +
> > +#define IOREGIONFD_SIZE_OFFSET 4
> > +#define IOREGIONFD_RESP_OFFSET 6
> > +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> > +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)
> 
> These belong in the uapi header so userspace also has struct
> ioregionfd_cmd, struct ioregionfd_resp, etc.
> 
I'll move the wire protocol to a separate uapi header

> > +
> >  void
> >  kvm_ioregionfd_init(struct kvm *kvm)
> >  {
> > @@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p)
> >  	kfree(p);
> >  }
> >  
> > +static bool
> > +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt,
> > bool resp,
> > +	 u64 user_data, const void *val)
> > +{
> > +	u64 size = 0;
> > +
> > +	switch (len) {
> > +	case 1:
> > +		size = IOREGIONFD_SIZE_8BIT;
> > +		*((u8 *)&cmd->data) = val ? *(u8 *)val : 0;
> > +		break;
> > +	case 2:
> > +		size = IOREGIONFD_SIZE_16BIT;
> > +		*((u16 *)&cmd->data) = val ? *(u16 *)val : 0;
> > +		break;
> > +	case 4:
> > +		size = IOREGIONFD_SIZE_32BIT;
> > +		*((u32 *)&cmd->data) = val ? *(u32 *)val : 0;
> > +		break;
> > +	case 8:
> > +		size = IOREGIONFD_SIZE_64BIT;
> > +		*((u64 *)&cmd->data) = val ? *(u64 *)val : 0;
> > +		break;
> > +	default:
> > +		return false;
> > 
> 
> The assignments and casts can be replaced with a single memcpy after
> the
> switch statement. This is also how KVM_EXIT_MMIO and Coalesced MMIO
> do
> it:
> 
>   memcpy(cmd->data, val, len);
> 
Thanks for pointing it out

> However, we need to make sure that cmd has been zeroed so that kernel
> memory is not accidentally exposed to userspace.
> 
>  +	}
> > +	cmd->user_data = user_data;
> > +	cmd->offset = offset;
> > +	cmd->info |= opt;
> > +	cmd->info |= IOREGIONFD_SIZE(size);
> > +	cmd->info |= IOREGIONFD_RESP(resp);
> > +
> > +	return true;
> > +}
> > +
> >  static int
> >  ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> >  	      int len, void *val)
> >  {
> > +	struct ioregion *p = to_ioregion(this);
> > +	struct ioregionfd_cmd *cmd;
> > +	struct ioregionfd_resp *resp;
> > +	size_t buf_size;
> > +	void *buf;
> > +	int ret = 0;
> > +
> > +	if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags &
> > O_NONBLOCK))
> > +		return -EINVAL;
> 
> Another CPU could change file descriptor flags while we are running.
> Therefore it might be simplest to handle kernel_write() and
> kernel_read() -EAGAIN return values instead of trying to check.
> 
Ok, I'll fix this

> > +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> > +		return -EINVAL;
> > +
> > +	buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp));
> > +	buf = kzalloc(buf_size, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +	cmd = (struct ioregionfd_cmd *)buf;
> > +	resp = (struct ioregionfd_resp *)buf;
> 
> I think they are small enough to declare them on the stack:
> 
>   union {
>       struct ioregionfd_cmd cmd;
>       struct ioregionfd_resp resp;
>   } buf;
> 
>   memset(&buf, 0, sizeof(buf));
> 
Ok

> > +	if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
> > +		      1, p->user_data, NULL)) {
> > +		kfree(buf);
> > +		return -EOPNOTSUPP;
> > +	}
> > +
> > +	ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0);
> > +	if (ret != sizeof(*cmd)) {
> > +		kfree(buf);
> > +		return (ret < 0) ? ret : -EIO;
> > +	}
> > +	memset(buf, 0, buf_size);
> > +	ret = kernel_read(p->rf, resp, sizeof(*resp), 0);
> > +	if (ret != sizeof(*resp)) {
> > +		kfree(buf);
> > +		return (ret < 0) ? ret : -EIO;
> > +	}
> > +
> > +	switch (len) {
> > +	case 1:
> > +		*(u8 *)val = (u8)resp->data;
> > +		break;
> > +	case 2:
> > +		*(u16 *)val = (u16)resp->data;
> > +		break;
> > +	case 4:
> > +		*(u32 *)val = (u32)resp->data;
> > +		break;
> > +	case 8:
> > +		*(u64 *)val = (u64)resp->data;
> > +		break;
> > +	default:
> > +		break;
> > +	}
> 
> This looks inconsistent. cmd->data is treated as a packed
> u8/u16/u32/u64
> whereas resp->data is treated as u64?
> 
> I was expecting memcpy(val, &resp->data, len) here not the u64 ->
> u8/u16/u32/u64 conversion.
I'll fix this, thanks!



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
  2020-12-29 12:06 ` [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
@ 2020-12-30 17:56   ` Elena Afanasova
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2020-12-30 17:56 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: kvm, jag.raman, elena.ufimtseva, Michael S. Tsirkin, jasowang

On Tue, 2020-12-29 at 12:06 +0000, Stefan Hajnoczi wrote:
> On Tue, Dec 29, 2020 at 01:02:42PM +0300, Elena Afanasova wrote:
> > This patchset introduces a KVM dispatch mechanism which can be
> > used 
> > for handling MMIO/PIO accesses over file descriptors without
> > returning 
> > from ioctl(KVM_RUN). This allows device emulation to run in another
> > task 
> > separate from the vCPU task.
> > 
> > This is achieved through KVM vm ioctl for registering MMIO/PIO
> > regions and 
> > a wire protocol that KVM uses to communicate with a task handling
> > an 
> > MMIO/PIO access.
> > 
> > ioregionfd relies on kmemcg in order to limit the amount of kernel
> > memory 
> > that userspace can consume. Can NR_IOBUS_DEVS hardcoded limit be
> > enforced 
> > only in case kmemcg is disabled?
> 
> Thanks for sharing this! Can you describe the todos? I noticed some
> in
> Patch 1 and highlighted them. In addition:
>  * Signal handling when the vCPU thread is interrupted in
>    kernel_read()/kernel_write()
> 
TODOs:

* Signal handling when the vCPU thread is interrupted in
   kernel_read()/kernel_write()
* Add ioregionfd cmds/replies serialization
* Implement KVM_EXIT_IOREGIONFD_FAILURE
* Add non-x86 arch support
* Add kvm-unittests

> > Elena Afanasova (2):
> >   KVM: add initial support for KVM_SET_IOREGION
> >   KVM: add initial support for ioregionfd blocking read/write
> > operations
> > 
> >  arch/x86/kvm/Kconfig     |   1 +
> >  arch/x86/kvm/Makefile    |   1 +
> >  arch/x86/kvm/x86.c       |   1 +
> >  include/linux/kvm_host.h |  17 ++
> >  include/uapi/linux/kvm.h |  23 +++
> >  virt/kvm/Kconfig         |   3 +
> >  virt/kvm/eventfd.c       |  25 +++
> >  virt/kvm/eventfd.h       |  14 ++
> >  virt/kvm/ioregion.c      | 390
> > +++++++++++++++++++++++++++++++++++++++
> >  virt/kvm/ioregion.h      |  15 ++
> >  virt/kvm/kvm_main.c      |  20 +-
> >  11 files changed, 507 insertions(+), 3 deletions(-)
> >  create mode 100644 virt/kvm/eventfd.h
> >  create mode 100644 virt/kvm/ioregion.c
> >  create mode 100644 virt/kvm/ioregion.h
> > 
> > -- 
> > 2.25.1
> > 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2020-12-29 10:02 ` [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
  2020-12-29 11:36   ` Stefan Hajnoczi
@ 2020-12-31  3:45   ` Jason Wang
  2021-01-03 20:32     ` Elena Afanasova
  1 sibling, 1 reply; 28+ messages in thread
From: Jason Wang @ 2020-12-31  3:45 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2020/12/29 下午6:02, Elena Afanasova wrote:
> This vm ioctl adds or removes an ioregionfd MMIO/PIO region.


How about FAST_MMIO?


> Guest
> read and write accesses are dispatched through the given ioregionfd
> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> setting fds to -1.
>
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
>   arch/x86/kvm/Kconfig     |   1 +
>   arch/x86/kvm/Makefile    |   1 +
>   arch/x86/kvm/x86.c       |   1 +
>   include/linux/kvm_host.h |  17 +++
>   include/uapi/linux/kvm.h |  23 ++++
>   virt/kvm/Kconfig         |   3 +
>   virt/kvm/eventfd.c       |  25 +++++
>   virt/kvm/eventfd.h       |  14 +++
>   virt/kvm/ioregion.c      | 233 +++++++++++++++++++++++++++++++++++++++
>   virt/kvm/ioregion.h      |  15 +++
>   virt/kvm/kvm_main.c      |  20 +++-
>   11 files changed, 350 insertions(+), 3 deletions(-)
>   create mode 100644 virt/kvm/eventfd.h
>   create mode 100644 virt/kvm/ioregion.c
>   create mode 100644 virt/kvm/ioregion.h
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index f92dfd8ef10d..b914ef375199 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -33,6 +33,7 @@ config KVM
>   	select HAVE_KVM_IRQ_BYPASS
>   	select HAVE_KVM_IRQ_ROUTING
>   	select HAVE_KVM_EVENTFD
> +	select KVM_IOREGION
>   	select KVM_ASYNC_PF
>   	select USER_RETURN_NOTIFIER
>   	select KVM_MMIO
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index b804444e16d4..b3b17dc9f7d4 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>   kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
>   				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
>   kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>   
>   kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
>   			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e545a8a613b1..ddb28f5ca252 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   	case KVM_CAP_X86_USER_SPACE_MSR:
>   	case KVM_CAP_X86_MSR_FILTER:
>   	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> +	case KVM_CAP_IOREGIONFD:
>   		r = 1;
>   		break;
>   	case KVM_CAP_SYNC_REGS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7f2e2a09ebbd..7cd667dddba9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -470,6 +470,10 @@ struct kvm {
>   		struct mutex      resampler_lock;
>   	} irqfds;
>   	struct list_head ioeventfds;
> +#endif
> +#ifdef CONFIG_KVM_IOREGION
> +	struct list_head ioregions_mmio;
> +	struct list_head ioregions_pio;
>   #endif
>   	struct kvm_vm_stat stat;
>   	struct kvm_arch arch;
> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   
>   #endif /* CONFIG_HAVE_KVM_EVENTFD */
>   
> +#ifdef CONFIG_KVM_IOREGION
> +void kvm_ioregionfd_init(struct kvm *kvm);
> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> +
> +#else
> +
> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> +static inline int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	return -ENOSYS;
> +}
> +#endif
> +
>   void kvm_arch_irq_routing_update(struct kvm *kvm);
>   
>   static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ca41220b40b8..81e775778c66 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>   
> +enum {
> +	kvm_ioregion_flag_nr_pio,
> +	kvm_ioregion_flag_nr_posted_writes,
> +	kvm_ioregion_flag_nr_max,
> +};
> +
> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> +#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
> +
> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
> +
> +struct kvm_ioregion {
> +	__u64 guest_paddr; /* guest physical address */
> +	__u64 memory_size; /* bytes */
> +	__u64 user_data;


What will this field do? Is it a token?


> +	__s32 rfd;
> +	__s32 wfd;
> +	__u32 flags;
> +	__u8  pad[28];
> +};


Is it possible to register the same fd with multiple GPA ranges? If 
not, do we need to check for fd collision?


> +
>   #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>   #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>   #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>   #define KVM_CAP_X86_USER_SPACE_MSR 188
>   #define KVM_CAP_X86_MSR_FILTER 189
>   #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> +#define KVM_CAP_IOREGIONFD 191
>   
>   #ifdef KVM_CAP_IRQ_ROUTING
>   
> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>   					struct kvm_userspace_memory_region)
>   #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>   #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct kvm_ioregion)
>   
>   /* enable ucontrol for s390 */
>   struct kvm_s390_ucas_mapping {
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 1c37ccd5d402..5e6620bbf000 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
>          bool
>          select EVENTFD
>   
> +config KVM_IOREGION
> +       bool
> +
>   config KVM_MMIO
>          bool
>   
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index c2323c27a28b..aadb73903f8b 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -27,6 +27,7 @@
>   #include <trace/events/kvm.h>
>   
>   #include <kvm/iodev.h>
> +#include "ioregion.h"
>   
>   #ifdef CONFIG_HAVE_KVM_IRQFD
>   
> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops ioeventfd_ops = {
>   	.destructor = ioeventfd_destructor,
>   };
>   
> +#ifdef CONFIG_KVM_IOREGION
> +/* assumes kvm->slots_lock held */
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> +			  u64 start, u64 size)
> +{
> +	struct _ioeventfd *_p;
> +
> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> +		if (_p->bus_idx == bus_idx &&
> +		    overlap(start, size, _p->addr,
> +			    !_p->length ? 8 : _p->length))
> +			return true;
> +
> +	return false;
> +}
> +#endif
> +
>   /* assumes kvm->slots_lock held */
>   static bool
>   ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>   		       _p->datamatch == p->datamatch))))
>   			return true;
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> +					  !p->length ? 8 : p->length))
> +			return true;
> +#endif
> +
>   	return false;
>   }
>   
> diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> new file mode 100644
> index 000000000000..73a621eebae3
> --- /dev/null
> +++ b/virt/kvm/eventfd.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __KVM_EVENTFD_H__
> +#define __KVM_EVENTFD_H__
> +
> +#ifdef CONFIG_KVM_IOREGION
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
> +#else
> +static inline bool
> +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
> +{
> +	return false;
> +}
> +#endif
> +#endif
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> new file mode 100644
> index 000000000000..a200c3761343
> --- /dev/null
> +++ b/virt/kvm/ioregion.c
> @@ -0,0 +1,233 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kvm_host.h>
> +#include <linux/fs.h>
> +#include <kvm/iodev.h>
> +#include "eventfd.h"
> +
> +void
> +kvm_ioregionfd_init(struct kvm *kvm)
> +{
> +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> +}
> +
> +struct ioregion {
> +	struct list_head     list;
> +	u64                  paddr;
> +	u64                  size;
> +	struct file         *rf;
> +	struct file         *wf;
> +	u64                  user_data;
> +	struct kvm_io_device dev;
> +	bool                 posted_writes;
> +};
> +
> +static inline struct ioregion *
> +to_ioregion(struct kvm_io_device *dev)
> +{
> +	return container_of(dev, struct ioregion, dev);
> +}
> +
> +/* assumes kvm->slots_lock held */
> +static void
> +ioregion_release(struct ioregion *p)
> +{
> +	fput(p->rf);
> +	fput(p->wf);
> +	list_del(&p->list);
> +	kfree(p);
> +}
> +
> +static int
> +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> +	      int len, void *val)
> +{
> +	return 0;
> +}
> +
> +static int
> +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> +		int len, const void *val)
> +{
> +	return 0;
> +}
> +
> +/*
> + * This function is called as KVM is completely shutting down.  We do not
> + * need to worry about locking just nuke anything we have as quickly as possible
> + */
> +static void
> +ioregion_destructor(struct kvm_io_device *this)
> +{
> +	struct ioregion *p = to_ioregion(this);
> +
> +	ioregion_release(p);
> +}
> +
> +static const struct kvm_io_device_ops ioregion_ops = {
> +	.read       = ioregion_read,
> +	.write      = ioregion_write,
> +	.destructor = ioregion_destructor,
> +};
> +
> +static inline struct list_head *
> +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> +{
> +	return (bus_idx == KVM_MMIO_BUS) ?
> +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> +}
> +
> +/* check for not overlapping case and reverse */
> +inline bool
> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> +{
> +	u64 end1 = start1 + size1 - 1;
> +	u64 end2 = start2 + size2 - 1;
> +
> +	return !(end1 < start2 || start1 >= end2);
> +}
> +
> +/* assumes kvm->slots_lock held */
> +bool
> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> +		      u64 start, u64 size)
> +{
> +	struct ioregion *_p;
> +	struct list_head *ioregions;
> +
> +	ioregions = get_ioregion_list(kvm, bus_idx);
> +	list_for_each_entry(_p, ioregions, list)
> +		if (overlap(start, size, _p->paddr, _p->size))
> +			return true;
> +
> +	return false;
> +}
> +
> +/* assumes kvm->slots_lock held */
> +static bool
> +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum kvm_bus bus_idx)
> +{
> +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> +		return true;
> +
> +	return false;
> +}
> +
> +static enum kvm_bus
> +get_bus_from_flags(__u32 flags)
> +{
> +	if (flags & KVM_IOREGION_PIO)
> +		return KVM_PIO_BUS;
> +	return KVM_MMIO_BUS;
> +}
> +
> +int
> +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	struct ioregion *p;
> +	bool is_posted_writes;
> +	struct file *rfile, *wfile;
> +	enum kvm_bus bus_idx;
> +	int ret = 0;
> +
> +	if (!args->memory_size)
> +		return -EINVAL;
> +	if ((args->guest_paddr + args->memory_size - 1) < args->guest_paddr)
> +		return -EINVAL;
> +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> +		return -EINVAL;
> +
> +	rfile = fget(args->rfd);
> +	if (!rfile)
> +		return -EBADF;
> +	wfile = fget(args->wfd);
> +	if (!wfile) {
> +		fput(rfile);
> +		return -EBADF;
> +	}
> +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags & O_NONBLOCK)) {
> +		ret = -EINVAL;
> +		goto fail;
> +	}


Instead of checking nonblocking, can we poll here?


> +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> +	if (!p) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	INIT_LIST_HEAD(&p->list);
> +	p->paddr = args->guest_paddr;
> +	p->size = args->memory_size;
> +	p->user_data = args->user_data;
> +	p->rf = rfile;
> +	p->wf = wfile;
> +	is_posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
> +	p->posted_writes = is_posted_writes ? true : false;
> +	bus_idx = get_bus_from_flags(args->flags);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	if (ioregion_collision(kvm, p, bus_idx)) {
> +		ret = -EEXIST;
> +		goto unlock_fail;
> +	}
> +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
> +				      &p->dev);
> +	if (ret < 0)
> +		goto unlock_fail;


We probably need to register to FAST_MMIO when bus_idx is MMIO.


> +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return 0;
> +
> +unlock_fail:
> +	mutex_unlock(&kvm->slots_lock);
> +	kfree(p);
> +fail:
> +	fput(rfile);
> +	fput(wfile);
> +
> +	return ret;
> +}
> +
> +static int
> +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	struct ioregion         *p, *tmp;
> +	enum kvm_bus             bus_idx;
> +	int                      ret = -ENOENT;
> +	struct list_head        *ioregions;
> +
> +	if (args->rfd != -1 || args->wfd != -1)
> +		return -EINVAL;


If we want to use ioregion fd for doorbell, rfd is probably not 
necessary here.

Thanks


> +
> +	bus_idx = get_bus_from_flags(args->flags);
> +	ioregions = get_ioregion_list(kvm, bus_idx);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> +		if (p->paddr == args->guest_paddr  &&
> +		    p->size == args->memory_size) {
> +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
> +			ioregion_release(p);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return ret;
> +}
> +
> +int
> +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	if (args->rfd == -1 || args->wfd == -1)
> +		return kvm_rm_ioregion(kvm, args);
> +	return kvm_set_ioregion(kvm, args);
> +}
> diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> new file mode 100644
> index 000000000000..23ffa812ec7a
> --- /dev/null
> +++ b/virt/kvm/ioregion.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __KVM_IOREGION_H__
> +#define __KVM_IOREGION_H__
> +
> +#ifdef CONFIG_KVM_IOREGION
> +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
> +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
> +#else
> +static inline bool
> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
> +{
> +	return false;
> +}
> +#endif
> +#endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2541a17ff1c4..385d8ec6350d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
>   	mmgrab(current->mm);
>   	kvm->mm = current->mm;
>   	kvm_eventfd_init(kvm);
> +	kvm_ioregionfd_init(kvm);
>   	mutex_init(&kvm->lock);
>   	mutex_init(&kvm->irq_lock);
>   	mutex_init(&kvm->slots_lock);
> @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
>   		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
>   		break;
>   	}
> +	case KVM_SET_IOREGION: {
> +		struct kvm_ioregion data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&data, argp, sizeof(data)))
> +			goto out;
> +
> +		r = kvm_ioregionfd(kvm, &data);
> +		break;
> +	}
>   	case KVM_GET_DIRTY_LOG: {
>   		struct kvm_dirty_log log;
>   
> @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>   	if (!bus)
>   		return -ENOMEM;
>   
> -	/* exclude ioeventfd which is limited by maximum fd */
> -	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> -		return -ENOSPC;
> +	/* enforce hard limit if kmemcg is disabled and
> +	 * exclude ioeventfd which is limited by maximum fd
> +	 */
> +	if (!memcg_kmem_enabled())
> +		if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> +			return -ENOSPC;
>   
>   	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
>   			  GFP_KERNEL_ACCOUNT);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2020-12-29 10:02 ` [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations Elena Afanasova
  2020-12-29 12:00   ` Stefan Hajnoczi
@ 2020-12-31  3:46   ` Jason Wang
  2021-01-03 20:37     ` Elena Afanasova
  1 sibling, 1 reply; 28+ messages in thread
From: Jason Wang @ 2020-12-31  3:46 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2020/12/29 下午6:02, Elena Afanasova wrote:
> Signed-off-by: Elena Afanasova<eafanasova@gmail.com>
> ---
>   virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 157 insertions(+)
>
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index a200c3761343..8523f4126337 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -4,6 +4,33 @@
>   #include <kvm/iodev.h>
>   #include "eventfd.h"
>   
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};
> +


I wonder do we need a seq in the protocol. It might be useful if we 
allow a pair of file descriptors to be used for multiple different ranges.

Thanks


> +struct ioregionfd_resp {
> +	__u64 data;
> +	__u8 pad[24];
> +};


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2020-12-31  3:45   ` Jason Wang
@ 2021-01-03 20:32     ` Elena Afanasova
  2021-01-04  5:34       ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Elena Afanasova @ 2021-01-03 20:32 UTC (permalink / raw)
  To: Jason Wang, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva

On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
> 
> How about FAST_MMIO?
> 
I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be suitable
for triggers which could use posted writes. The struct ioregionfd_cmd
size bits and the data field will be unused in this case.

> 
> > Guest
> > read and write accesses are dispatched through the given ioregionfd
> > instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> > setting fds to -1.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> >   arch/x86/kvm/Kconfig     |   1 +
> >   arch/x86/kvm/Makefile    |   1 +
> >   arch/x86/kvm/x86.c       |   1 +
> >   include/linux/kvm_host.h |  17 +++
> >   include/uapi/linux/kvm.h |  23 ++++
> >   virt/kvm/Kconfig         |   3 +
> >   virt/kvm/eventfd.c       |  25 +++++
> >   virt/kvm/eventfd.h       |  14 +++
> >   virt/kvm/ioregion.c      | 233
> > +++++++++++++++++++++++++++++++++++++++
> >   virt/kvm/ioregion.h      |  15 +++
> >   virt/kvm/kvm_main.c      |  20 +++-
> >   11 files changed, 350 insertions(+), 3 deletions(-)
> >   create mode 100644 virt/kvm/eventfd.h
> >   create mode 100644 virt/kvm/ioregion.c
> >   create mode 100644 virt/kvm/ioregion.h
> > 
> > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > index f92dfd8ef10d..b914ef375199 100644
> > --- a/arch/x86/kvm/Kconfig
> > +++ b/arch/x86/kvm/Kconfig
> > @@ -33,6 +33,7 @@ config KVM
> >   	select HAVE_KVM_IRQ_BYPASS
> >   	select HAVE_KVM_IRQ_ROUTING
> >   	select HAVE_KVM_EVENTFD
> > +	select KVM_IOREGION
> >   	select KVM_ASYNC_PF
> >   	select USER_RETURN_NOTIFIER
> >   	select KVM_MMIO
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index b804444e16d4..b3b17dc9f7d4 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> >   kvm-y			+= $(KVM)/kvm_main.o
> > $(KVM)/coalesced_mmio.o \
> >   				$(KVM)/eventfd.o $(KVM)/irqchip.o
> > $(KVM)/vfio.o
> >   kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> >   
> >   kvm-y			+= x86.o emulate.o i8259.o irq.o
> > lapic.o \
> >   			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
> > mtrr.o \
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index e545a8a613b1..ddb28f5ca252 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
> > *kvm, long ext)
> >   	case KVM_CAP_X86_USER_SPACE_MSR:
> >   	case KVM_CAP_X86_MSR_FILTER:
> >   	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > +	case KVM_CAP_IOREGIONFD:
> >   		r = 1;
> >   		break;
> >   	case KVM_CAP_SYNC_REGS:
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7f2e2a09ebbd..7cd667dddba9 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -470,6 +470,10 @@ struct kvm {
> >   		struct mutex      resampler_lock;
> >   	} irqfds;
> >   	struct list_head ioeventfds;
> > +#endif
> > +#ifdef CONFIG_KVM_IOREGION
> > +	struct list_head ioregions_mmio;
> > +	struct list_head ioregions_pio;
> >   #endif
> >   	struct kvm_vm_stat stat;
> >   	struct kvm_arch arch;
> > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm
> > *kvm, struct kvm_ioeventfd *args)
> >   
> >   #endif /* CONFIG_HAVE_KVM_EVENTFD */
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +void kvm_ioregionfd_init(struct kvm *kvm);
> > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> > +
> > +#else
> > +
> > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > kvm_ioregion *args)
> > +{
> > +	return -ENOSYS;
> > +}
> > +#endif
> > +
> >   void kvm_arch_irq_routing_update(struct kvm *kvm);
> >   
> >   static inline void kvm_make_request(int req, struct kvm_vcpu
> > *vcpu)
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index ca41220b40b8..81e775778c66 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> >   	__u8  pad[36];
> >   };
> >   
> > +enum {
> > +	kvm_ioregion_flag_nr_pio,
> > +	kvm_ioregion_flag_nr_posted_writes,
> > +	kvm_ioregion_flag_nr_max,
> > +};
> > +
> > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > kvm_ioregion_flag_nr_posted_writes)
> > +
> > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > kvm_ioregion_flag_nr_max) - 1)
> > +
> > +struct kvm_ioregion {
> > +	__u64 guest_paddr; /* guest physical address */
> > +	__u64 memory_size; /* bytes */
> > +	__u64 user_data;
> 
> What will this field do? Is it a token?
> 
Yes, it’s an opaque token that can be used by userspace in order to
determine which MemoryRegion to dispatch.

> 
> > +	__s32 rfd;
> > +	__s32 wfd;
> > +	__u32 flags;
> > +	__u8  pad[28];
> > +};
> 
> Is this possible to register the same fd with multiple GPA ranges?
> If 
> not, do we need to check for fd collision?
> 
Yes, it’s possible to register the same fd with multiple GPA ranges.

> 
> > +
> >   #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> >   #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> >   #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> >   #define KVM_CAP_X86_USER_SPACE_MSR 188
> >   #define KVM_CAP_X86_MSR_FILTER 189
> >   #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > +#define KVM_CAP_IOREGIONFD 191
> >   
> >   #ifdef KVM_CAP_IRQ_ROUTING
> >   
> > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> >   					struct
> > kvm_userspace_memory_region)
> >   #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> >   #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > kvm_ioregion)
> >   
> >   /* enable ucontrol for s390 */
> >   struct kvm_s390_ucas_mapping {
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 1c37ccd5d402..5e6620bbf000 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
> >          bool
> >          select EVENTFD
> >   
> > +config KVM_IOREGION
> > +       bool
> > +
> >   config KVM_MMIO
> >          bool
> >   
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index c2323c27a28b..aadb73903f8b 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -27,6 +27,7 @@
> >   #include <trace/events/kvm.h>
> >   
> >   #include <kvm/iodev.h>
> > +#include "ioregion.h"
> >   
> >   #ifdef CONFIG_HAVE_KVM_IRQFD
> >   
> > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > ioeventfd_ops = {
> >   	.destructor = ioeventfd_destructor,
> >   };
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +/* assumes kvm->slots_lock held */
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > +			  u64 start, u64 size)
> > +{
> > +	struct _ioeventfd *_p;
> > +
> > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > +		if (_p->bus_idx == bus_idx &&
> > +		    overlap(start, size, _p->addr,
> > +			    !_p->length ? 8 : _p->length))
> > +			return true;
> > +
> > +	return false;
> > +}
> > +#endif
> > +
> >   /* assumes kvm->slots_lock held */
> >   static bool
> >   ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > struct _ioeventfd *p)
> >   		       _p->datamatch == p->datamatch))))
> >   			return true;
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> > +					  !p->length ? 8 : p->length))
> > +			return true;
> > +#endif
> > +
> >   	return false;
> >   }
> >   
> > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> > new file mode 100644
> > index 000000000000..73a621eebae3
> > --- /dev/null
> > +++ b/virt/kvm/eventfd.h
> > @@ -0,0 +1,14 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __KVM_EVENTFD_H__
> > +#define __KVM_EVENTFD_H__
> > +
> > +#ifdef CONFIG_KVM_IOREGION
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
> > u64 size);
> > +#else
> > +static inline bool
> > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > size)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +#endif
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > new file mode 100644
> > index 000000000000..a200c3761343
> > --- /dev/null
> > +++ b/virt/kvm/ioregion.c
> > @@ -0,0 +1,233 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/kvm_host.h>
> > +#include <linux/fs.h>
> > +#include <kvm/iodev.h>
> > +#include "eventfd.h"
> > +
> > +void
> > +kvm_ioregionfd_init(struct kvm *kvm)
> > +{
> > +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> > +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> > +}
> > +
> > +struct ioregion {
> > +	struct list_head     list;
> > +	u64                  paddr;
> > +	u64                  size;
> > +	struct file         *rf;
> > +	struct file         *wf;
> > +	u64                  user_data;
> > +	struct kvm_io_device dev;
> > +	bool                 posted_writes;
> > +};
> > +
> > +static inline struct ioregion *
> > +to_ioregion(struct kvm_io_device *dev)
> > +{
> > +	return container_of(dev, struct ioregion, dev);
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +static void
> > +ioregion_release(struct ioregion *p)
> > +{
> > +	fput(p->rf);
> > +	fput(p->wf);
> > +	list_del(&p->list);
> > +	kfree(p);
> > +}
> > +
> > +static int
> > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> > +	      int len, void *val)
> > +{
> > +	return 0;
> > +}
> > +
> > +static int
> > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> > +		int len, const void *val)
> > +{
> > +	return 0;
> > +}
> > +
> > +/*
> > + * This function is called as KVM is completely shutting down.  We
> > do not
> > + * need to worry about locking just nuke anything we have as
> > quickly as possible
> > + */
> > +static void
> > +ioregion_destructor(struct kvm_io_device *this)
> > +{
> > +	struct ioregion *p = to_ioregion(this);
> > +
> > +	ioregion_release(p);
> > +}
> > +
> > +static const struct kvm_io_device_ops ioregion_ops = {
> > +	.read       = ioregion_read,
> > +	.write      = ioregion_write,
> > +	.destructor = ioregion_destructor,
> > +};
> > +
> > +static inline struct list_head *
> > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> > +{
> > +	return (bus_idx == KVM_MMIO_BUS) ?
> > +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> > +}
> > +
> > +/* check for not overlapping case and reverse */
> > +inline bool
> > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > +{
> > +	u64 end1 = start1 + size1 - 1;
> > +	u64 end2 = start2 + size2 - 1;
> > +
> > +	return !(end1 < start2 || start1 >= end2);
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +bool
> > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> > +		      u64 start, u64 size)
> > +{
> > +	struct ioregion *_p;
> > +	struct list_head *ioregions;
> > +
> > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > +	list_for_each_entry(_p, ioregions, list)
> > +		if (overlap(start, size, _p->paddr, _p->size))
> > +			return true;
> > +
> > +	return false;
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +static bool
> > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
> > kvm_bus bus_idx)
> > +{
> > +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> > +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +static enum kvm_bus
> > +get_bus_from_flags(__u32 flags)
> > +{
> > +	if (flags & KVM_IOREGION_PIO)
> > +		return KVM_PIO_BUS;
> > +	return KVM_MMIO_BUS;
> > +}
> > +
> > +int
> > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	struct ioregion *p;
> > +	bool is_posted_writes;
> > +	struct file *rfile, *wfile;
> > +	enum kvm_bus bus_idx;
> > +	int ret = 0;
> > +
> > +	if (!args->memory_size)
> > +		return -EINVAL;
> > +	if ((args->guest_paddr + args->memory_size - 1) < args-
> > >guest_paddr)
> > +		return -EINVAL;
> > +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> > +		return -EINVAL;
> > +
> > +	rfile = fget(args->rfd);
> > +	if (!rfile)
> > +		return -EBADF;
> > +	wfile = fget(args->wfd);
> > +	if (!wfile) {
> > +		fput(rfile);
> > +		return -EBADF;
> > +	}
> > +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags &
> > O_NONBLOCK)) {
> > +		ret = -EINVAL;
> > +		goto fail;
> > +	}
> 
> Instead of checking nonblocking, can we poll here?
> 
Yes, it’s possible. It would be necessary in the case of out-of-order
requests. But since multiple in-flight messages don’t seem to be a use
case, I’m not sure it’s necessary. Typically device register accesses
should not take a long time, so making them asynchronous doesn't seem
like a practical advantage. Also, this might complicate the code and
make it slower. What do you think?

> 
> > +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> > +	if (!p) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> > +	}
> > +
> > +	INIT_LIST_HEAD(&p->list);
> > +	p->paddr = args->guest_paddr;
> > +	p->size = args->memory_size;
> > +	p->user_data = args->user_data;
> > +	p->rf = rfile;
> > +	p->wf = wfile;
> > +	is_posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
> > +	p->posted_writes = is_posted_writes ? true : false;
> > +	bus_idx = get_bus_from_flags(args->flags);
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	if (ioregion_collision(kvm, p, bus_idx)) {
> > +		ret = -EEXIST;
> > +		goto unlock_fail;
> > +	}
> > +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> > +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
> > +				      &p->dev);
> > +	if (ret < 0)
> > +		goto unlock_fail;
> 
> We probably need to register to FAST_MMIO when bus_idx is MMIO.
> 
> 
> > +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return 0;
> > +
> > +unlock_fail:
> > +	mutex_unlock(&kvm->slots_lock);
> > +	kfree(p);
> > +fail:
> > +	fput(rfile);
> > +	fput(wfile);
> > +
> > +	return ret;
> > +}
> > +
> > +static int
> > +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	struct ioregion         *p, *tmp;
> > +	enum kvm_bus             bus_idx;
> > +	int                      ret = -ENOENT;
> > +	struct list_head        *ioregions;
> > +
> > +	if (args->rfd != -1 || args->wfd != -1)
> > +		return -EINVAL;
> 
> If we want to use ioregion fd for doorbell, rfd is probably not 
> necessary here.
> 
This condition simply enforces the requirement that a region can be
deleted only when both fds are set to -1.

> Thanks
> 
> 
> > +
> > +	bus_idx = get_bus_from_flags(args->flags);
> > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> > +		if (p->paddr == args->guest_paddr  &&
> > +		    p->size == args->memory_size) {
> > +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p-
> > >dev);
> > +			ioregion_release(p);
> > +			ret = 0;
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return ret;
> > +}
> > +
> > +int
> > +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	if (args->rfd == -1 || args->wfd == -1)
> > +		return kvm_rm_ioregion(kvm, args);
> > +	return kvm_set_ioregion(kvm, args);
> > +}
> > diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> > new file mode 100644
> > index 000000000000..23ffa812ec7a
> > --- /dev/null
> > +++ b/virt/kvm/ioregion.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __KVM_IOREGION_H__
> > +#define __KVM_IOREGION_H__
> > +
> > +#ifdef CONFIG_KVM_IOREGION
> > +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
> > +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64
> > start, u64 size);
> > +#else
> > +static inline bool
> > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > size)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +#endif
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 2541a17ff1c4..385d8ec6350d 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long
> > type)
> >   	mmgrab(current->mm);
> >   	kvm->mm = current->mm;
> >   	kvm_eventfd_init(kvm);
> > +	kvm_ioregionfd_init(kvm);
> >   	mutex_init(&kvm->lock);
> >   	mutex_init(&kvm->irq_lock);
> >   	mutex_init(&kvm->slots_lock);
> > @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
> >   		r = kvm_vm_ioctl_set_memory_region(kvm,
> > &kvm_userspace_mem);
> >   		break;
> >   	}
> > +	case KVM_SET_IOREGION: {
> > +		struct kvm_ioregion data;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&data, argp, sizeof(data)))
> > +			goto out;
> > +
> > +		r = kvm_ioregionfd(kvm, &data);
> > +		break;
> > +	}
> >   	case KVM_GET_DIRTY_LOG: {
> >   		struct kvm_dirty_log log;
> >   
> > @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm,
> > enum kvm_bus bus_idx, gpa_t addr,
> >   	if (!bus)
> >   		return -ENOMEM;
> >   
> > -	/* exclude ioeventfd which is limited by maximum fd */
> > -	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> > -		return -ENOSPC;
> > +	/* enforce hard limit if kmemcg is disabled and
> > +	 * exclude ioeventfd which is limited by maximum fd
> > +	 */
> > +	if (!memcg_kmem_enabled())
> > +		if (bus->dev_count - bus->ioeventfd_count >
> > NR_IOBUS_DEVS - 1)
> > +			return -ENOSPC;
> >   
> >   	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
> >   			  GFP_KERNEL_ACCOUNT);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2020-12-31  3:46   ` Jason Wang
@ 2021-01-03 20:37     ` Elena Afanasova
  2021-01-04  5:37       ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Elena Afanasova @ 2021-01-03 20:37 UTC (permalink / raw)
  To: Jason Wang, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva

On Thu, 2020-12-31 at 11:46 +0800, Jason Wang wrote:
> On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > Signed-off-by: Elena Afanasova<eafanasova@gmail.com>
> > ---
> >   virt/kvm/ioregion.c | 157
> > ++++++++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 157 insertions(+)
> > 
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > index a200c3761343..8523f4126337 100644
> > --- a/virt/kvm/ioregion.c
> > +++ b/virt/kvm/ioregion.c
> > @@ -4,6 +4,33 @@
> >   #include <kvm/iodev.h>
> >   #include "eventfd.h"
> >   
> > +/* Wire protocol */
> > +struct ioregionfd_cmd {
> > +	__u32 info;
> > +	__u32 padding;
> > +	__u64 user_data;
> > +	__u64 offset;
> > +	__u64 data;
> > +};
> > +
> 
> I wonder do we need a seq in the protocol. It might be useful if we 
> allow a pair of file descriptors to be used for multiple different
> ranges.
> 
I think it might be helpful in the case of out-of-order requests.
In the case of in-order requests, a seq field seems unnecessary
since cmds/replies will be serialized. I’ll include the
synchronization code in an RFC v2 series.

> Thanks
> 
> 
> > +struct ioregionfd_resp {
> > +	__u64 data;
> > +	__u8 pad[24];
> > +};


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-03 20:32     ` Elena Afanasova
@ 2021-01-04  5:34       ` Jason Wang
  2021-01-05  0:02         ` Elena Afanasova
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-04  5:34 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/4 上午4:32, Elena Afanasova wrote:
> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>> This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
>> How about FAST_MMIO?
>>
> I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be suitable
> for triggers which could use posted writes. The struct ioregionfd_cmd
> size bits and the data field will be unused in this case.


Note that eventfd checks for length and has datamatch support. Do we 
need to do something similar?

I guess the idea is to have a generic interface to let eventfd work for 
ioregion as well.


>
>>> Guest
>>> read and write accesses are dispatched through the given ioregionfd
>>> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
>>> setting fds to -1.
>>>
>>> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
>>> ---
>>>    arch/x86/kvm/Kconfig     |   1 +
>>>    arch/x86/kvm/Makefile    |   1 +
>>>    arch/x86/kvm/x86.c       |   1 +
>>>    include/linux/kvm_host.h |  17 +++
>>>    include/uapi/linux/kvm.h |  23 ++++
>>>    virt/kvm/Kconfig         |   3 +
>>>    virt/kvm/eventfd.c       |  25 +++++
>>>    virt/kvm/eventfd.h       |  14 +++
>>>    virt/kvm/ioregion.c      | 233
>>> +++++++++++++++++++++++++++++++++++++++
>>>    virt/kvm/ioregion.h      |  15 +++
>>>    virt/kvm/kvm_main.c      |  20 +++-
>>>    11 files changed, 350 insertions(+), 3 deletions(-)
>>>    create mode 100644 virt/kvm/eventfd.h
>>>    create mode 100644 virt/kvm/ioregion.c
>>>    create mode 100644 virt/kvm/ioregion.h
>>>
>>> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
>>> index f92dfd8ef10d..b914ef375199 100644
>>> --- a/arch/x86/kvm/Kconfig
>>> +++ b/arch/x86/kvm/Kconfig
>>> @@ -33,6 +33,7 @@ config KVM
>>>    	select HAVE_KVM_IRQ_BYPASS
>>>    	select HAVE_KVM_IRQ_ROUTING
>>>    	select HAVE_KVM_EVENTFD
>>> +	select KVM_IOREGION
>>>    	select KVM_ASYNC_PF
>>>    	select USER_RETURN_NOTIFIER
>>>    	select KVM_MMIO
>>> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
>>> index b804444e16d4..b3b17dc9f7d4 100644
>>> --- a/arch/x86/kvm/Makefile
>>> +++ b/arch/x86/kvm/Makefile
>>> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>>>    kvm-y			+= $(KVM)/kvm_main.o
>>> $(KVM)/coalesced_mmio.o \
>>>    				$(KVM)/eventfd.o $(KVM)/irqchip.o
>>> $(KVM)/vfio.o
>>>    kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>>> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>>>    
>>>    kvm-y			+= x86.o emulate.o i8259.o irq.o
>>> lapic.o \
>>>    			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
>>> mtrr.o \
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index e545a8a613b1..ddb28f5ca252 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
>>> *kvm, long ext)
>>>    	case KVM_CAP_X86_USER_SPACE_MSR:
>>>    	case KVM_CAP_X86_MSR_FILTER:
>>>    	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
>>> +	case KVM_CAP_IOREGIONFD:
>>>    		r = 1;
>>>    		break;
>>>    	case KVM_CAP_SYNC_REGS:
>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>> index 7f2e2a09ebbd..7cd667dddba9 100644
>>> --- a/include/linux/kvm_host.h
>>> +++ b/include/linux/kvm_host.h
>>> @@ -470,6 +470,10 @@ struct kvm {
>>>    		struct mutex      resampler_lock;
>>>    	} irqfds;
>>>    	struct list_head ioeventfds;
>>> +#endif
>>> +#ifdef CONFIG_KVM_IOREGION
>>> +	struct list_head ioregions_mmio;
>>> +	struct list_head ioregions_pio;
>>>    #endif
>>>    	struct kvm_vm_stat stat;
>>>    	struct kvm_arch arch;
>>> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm
>>> *kvm, struct kvm_ioeventfd *args)
>>>    
>>>    #endif /* CONFIG_HAVE_KVM_EVENTFD */
>>>    
>>> +#ifdef CONFIG_KVM_IOREGION
>>> +void kvm_ioregionfd_init(struct kvm *kvm);
>>> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
>>> +
>>> +#else
>>> +
>>> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
>>> +static inline int kvm_ioregionfd(struct kvm *kvm, struct
>>> kvm_ioregion *args)
>>> +{
>>> +	return -ENOSYS;
>>> +}
>>> +#endif
>>> +
>>>    void kvm_arch_irq_routing_update(struct kvm *kvm);
>>>    
>>>    static inline void kvm_make_request(int req, struct kvm_vcpu
>>> *vcpu)
>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>> index ca41220b40b8..81e775778c66 100644
>>> --- a/include/uapi/linux/kvm.h
>>> +++ b/include/uapi/linux/kvm.h
>>> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>>>    	__u8  pad[36];
>>>    };
>>>    
>>> +enum {
>>> +	kvm_ioregion_flag_nr_pio,
>>> +	kvm_ioregion_flag_nr_posted_writes,
>>> +	kvm_ioregion_flag_nr_max,
>>> +};
>>> +
>>> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
>>> +#define KVM_IOREGION_POSTED_WRITES (1 <<
>>> kvm_ioregion_flag_nr_posted_writes)
>>> +
>>> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
>>> kvm_ioregion_flag_nr_max) - 1)
>>> +
>>> +struct kvm_ioregion {
>>> +	__u64 guest_paddr; /* guest physical address */
>>> +	__u64 memory_size; /* bytes */
>>> +	__u64 user_data;
>> What will this field do? Is it a token?
>>
> Yes, it’s an opaque token that can be used by userspace in order to
> determine which MemoryRegion to dispatch.


This part I don't understand. Userspace should know the fd number (which 
I guess should be sufficient?).


>
>>> +	__s32 rfd;
>>> +	__s32 wfd;
>>> +	__u32 flags;
>>> +	__u8  pad[28];
>>> +};
>> Is this possible to register the same fd with multiple GPA ranges?
>> If
>> not, do we need to check for fd collision?
>>
> Yes, it’s possible to register the same fd with multiple GPA ranges.
>
>>> +
>>>    #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>>>    #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>>>    #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
>>> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>>>    #define KVM_CAP_X86_USER_SPACE_MSR 188
>>>    #define KVM_CAP_X86_MSR_FILTER 189
>>>    #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
>>> +#define KVM_CAP_IOREGIONFD 191
>>>    
>>>    #ifdef KVM_CAP_IRQ_ROUTING
>>>    
>>> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>>>    					struct
>>> kvm_userspace_memory_region)
>>>    #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>>>    #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
>>> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
>>> kvm_ioregion)
>>>    
>>>    /* enable ucontrol for s390 */
>>>    struct kvm_s390_ucas_mapping {
>>> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
>>> index 1c37ccd5d402..5e6620bbf000 100644
>>> --- a/virt/kvm/Kconfig
>>> +++ b/virt/kvm/Kconfig
>>> @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
>>>           bool
>>>           select EVENTFD
>>>    
>>> +config KVM_IOREGION
>>> +       bool
>>> +
>>>    config KVM_MMIO
>>>           bool
>>>    
>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>> index c2323c27a28b..aadb73903f8b 100644
>>> --- a/virt/kvm/eventfd.c
>>> +++ b/virt/kvm/eventfd.c
>>> @@ -27,6 +27,7 @@
>>>    #include <trace/events/kvm.h>
>>>    
>>>    #include <kvm/iodev.h>
>>> +#include "ioregion.h"
>>>    
>>>    #ifdef CONFIG_HAVE_KVM_IRQFD
>>>    
>>> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
>>> ioeventfd_ops = {
>>>    	.destructor = ioeventfd_destructor,
>>>    };
>>>    
>>> +#ifdef CONFIG_KVM_IOREGION
>>> +/* assumes kvm->slots_lock held */
>>> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
>>> +			  u64 start, u64 size)
>>> +{
>>> +	struct _ioeventfd *_p;
>>> +
>>> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
>>> +		if (_p->bus_idx == bus_idx &&
>>> +		    overlap(start, size, _p->addr,
>>> +			    !_p->length ? 8 : _p->length))
>>> +			return true;
>>> +
>>> +	return false;
>>> +}
>>> +#endif
>>> +
>>>    /* assumes kvm->slots_lock held */
>>>    static bool
>>>    ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>>> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
>>> struct _ioeventfd *p)
>>>    		       _p->datamatch == p->datamatch))))
>>>    			return true;
>>>    
>>> +#ifdef CONFIG_KVM_IOREGION
>>> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
>>> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
>>> +					  !p->length ? 8 : p->length))
>>> +			return true;
>>> +#endif
>>> +
>>>    	return false;
>>>    }
>>>    
>>> diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
>>> new file mode 100644
>>> index 000000000000..73a621eebae3
>>> --- /dev/null
>>> +++ b/virt/kvm/eventfd.h
>>> @@ -0,0 +1,14 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +#ifndef __KVM_EVENTFD_H__
>>> +#define __KVM_EVENTFD_H__
>>> +
>>> +#ifdef CONFIG_KVM_IOREGION
>>> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
>>> u64 size);
>>> +#else
>>> +static inline bool
>>> +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64
>>> size)
>>> +{
>>> +	return false;
>>> +}
>>> +#endif
>>> +#endif
>>> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
>>> new file mode 100644
>>> index 000000000000..a200c3761343
>>> --- /dev/null
>>> +++ b/virt/kvm/ioregion.c
>>> @@ -0,0 +1,233 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +#include <linux/kvm_host.h>
>>> +#include <linux/fs.h>
>>> +#include <kvm/iodev.h>
>>> +#include "eventfd.h"
>>> +
>>> +void
>>> +kvm_ioregionfd_init(struct kvm *kvm)
>>> +{
>>> +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
>>> +	INIT_LIST_HEAD(&kvm->ioregions_pio);
>>> +}
>>> +
>>> +struct ioregion {
>>> +	struct list_head     list;
>>> +	u64                  paddr;
>>> +	u64                  size;
>>> +	struct file         *rf;
>>> +	struct file         *wf;
>>> +	u64                  user_data;
>>> +	struct kvm_io_device dev;
>>> +	bool                 posted_writes;
>>> +};
>>> +
>>> +static inline struct ioregion *
>>> +to_ioregion(struct kvm_io_device *dev)
>>> +{
>>> +	return container_of(dev, struct ioregion, dev);
>>> +}
>>> +
>>> +/* assumes kvm->slots_lock held */
>>> +static void
>>> +ioregion_release(struct ioregion *p)
>>> +{
>>> +	fput(p->rf);
>>> +	fput(p->wf);
>>> +	list_del(&p->list);
>>> +	kfree(p);
>>> +}
>>> +
>>> +static int
>>> +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
>>> gpa_t addr,
>>> +	      int len, void *val)
>>> +{
>>> +	return 0;
>>> +}
>>> +
>>> +static int
>>> +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
>>> gpa_t addr,
>>> +		int len, const void *val)
>>> +{
>>> +	return 0;
>>> +}
>>> +
>>> +/*
>>> + * This function is called as KVM is completely shutting down.  We
>>> do not
>>> + * need to worry about locking just nuke anything we have as
>>> quickly as possible
>>> + */
>>> +static void
>>> +ioregion_destructor(struct kvm_io_device *this)
>>> +{
>>> +	struct ioregion *p = to_ioregion(this);
>>> +
>>> +	ioregion_release(p);
>>> +}
>>> +
>>> +static const struct kvm_io_device_ops ioregion_ops = {
>>> +	.read       = ioregion_read,
>>> +	.write      = ioregion_write,
>>> +	.destructor = ioregion_destructor,
>>> +};
>>> +
>>> +static inline struct list_head *
>>> +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
>>> +{
>>> +	return (bus_idx == KVM_MMIO_BUS) ?
>>> +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
>>> +}
>>> +
>>> +/* check for not overlapping case and reverse */
>>> +inline bool
>>> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
>>> +{
>>> +	u64 end1 = start1 + size1 - 1;
>>> +	u64 end2 = start2 + size2 - 1;
>>> +
>>> +	return !(end1 < start2 || start1 >= end2);
>>> +}
>>> +
>>> +/* assumes kvm->slots_lock held */
>>> +bool
>>> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
>>> +		      u64 start, u64 size)
>>> +{
>>> +	struct ioregion *_p;
>>> +	struct list_head *ioregions;
>>> +
>>> +	ioregions = get_ioregion_list(kvm, bus_idx);
>>> +	list_for_each_entry(_p, ioregions, list)
>>> +		if (overlap(start, size, _p->paddr, _p->size))
>>> +			return true;
>>> +
>>> +	return false;
>>> +}
>>> +
>>> +/* assumes kvm->slots_lock held */
>>> +static bool
>>> +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
>>> kvm_bus bus_idx)
>>> +{
>>> +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
>>> +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
>>> +		return true;
>>> +
>>> +	return false;
>>> +}
>>> +
>>> +static enum kvm_bus
>>> +get_bus_from_flags(__u32 flags)
>>> +{
>>> +	if (flags & KVM_IOREGION_PIO)
>>> +		return KVM_PIO_BUS;
>>> +	return KVM_MMIO_BUS;
>>> +}
>>> +
>>> +int
>>> +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
>>> +{
>>> +	struct ioregion *p;
>>> +	bool is_posted_writes;
>>> +	struct file *rfile, *wfile;
>>> +	enum kvm_bus bus_idx;
>>> +	int ret = 0;
>>> +
>>> +	if (!args->memory_size)
>>> +		return -EINVAL;
>>> +	if ((args->guest_paddr + args->memory_size - 1) < args-
>>>> guest_paddr)
>>> +		return -EINVAL;
>>> +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
>>> +		return -EINVAL;
>>> +
>>> +	rfile = fget(args->rfd);
>>> +	if (!rfile)
>>> +		return -EBADF;
>>> +	wfile = fget(args->wfd);
>>> +	if (!wfile) {
>>> +		fput(rfile);
>>> +		return -EBADF;
>>> +	}
>>> +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags &
>>> O_NONBLOCK)) {
>>> +		ret = -EINVAL;
>>> +		goto fail;
>>> +	}
>> Instead of checking nonblocking, can we poll here?
>>
> Yes, it’s possible. It would be necessary in the case of out-of-order
> requests. But since multiple in-flight messages don’t seem to be a use
> case I’m not sure if it’s necessary. Typically device register accesses
> should not take a long time, so making them asynchronous doesn't seem
> like a practical advantage. Also this might complicate the code and
> make it slower. What do you think?


One issue I saw is that, if we register a single fd for e.g. two regions,
and those two regions are read in parallel from the guest, it looks to me
we don't have any synchronization in the current code.


>
>>> +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
>>> +	if (!p) {
>>> +		ret = -ENOMEM;
>>> +		goto fail;
>>> +	}
>>> +
>>> +	INIT_LIST_HEAD(&p->list);
>>> +	p->paddr = args->guest_paddr;
>>> +	p->size = args->memory_size;
>>> +	p->user_data = args->user_data;
>>> +	p->rf = rfile;
>>> +	p->wf = wfile;
>>> +	is_posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
>>> +	p->posted_writes = is_posted_writes ? true : false;
>>> +	bus_idx = get_bus_from_flags(args->flags);
>>> +
>>> +	mutex_lock(&kvm->slots_lock);
>>> +
>>> +	if (ioregion_collision(kvm, p, bus_idx)) {
>>> +		ret = -EEXIST;
>>> +		goto unlock_fail;
>>> +	}
>>> +	kvm_iodevice_init(&p->dev, &ioregion_ops);
>>> +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
>>> +				      &p->dev);
>>> +	if (ret < 0)
>>> +		goto unlock_fail;
>> We probably need to register to FAST_MMIO when bus_idx is MMIO.
>>
>>
>>> +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
>>> +
>>> +	mutex_unlock(&kvm->slots_lock);
>>> +
>>> +	return 0;
>>> +
>>> +unlock_fail:
>>> +	mutex_unlock(&kvm->slots_lock);
>>> +	kfree(p);
>>> +fail:
>>> +	fput(rfile);
>>> +	fput(wfile);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +static int
>>> +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
>>> +{
>>> +	struct ioregion         *p, *tmp;
>>> +	enum kvm_bus             bus_idx;
>>> +	int                      ret = -ENOENT;
>>> +	struct list_head        *ioregions;
>>> +
>>> +	if (args->rfd != -1 || args->wfd != -1)
>>> +		return -EINVAL;
>> If we want to use ioregion fd for doorbell, rfd is probably not
>> necessary here.
>>
> This condition is simply a requirement that region can be deleted in
> the case of both fds are set to -1.


Ok.

Thanks


>
>> Thanks
>>
>>
>>> +
>>> +	bus_idx = get_bus_from_flags(args->flags);
>>> +	ioregions = get_ioregion_list(kvm, bus_idx);
>>> +
>>> +	mutex_lock(&kvm->slots_lock);
>>> +
>>> +	list_for_each_entry_safe(p, tmp, ioregions, list) {
>>> +		if (p->paddr == args->guest_paddr  &&
>>> +		    p->size == args->memory_size) {
>>> +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p-
>>>> dev);
>>> +			ioregion_release(p);
>>> +			ret = 0;
>>> +			break;
>>> +		}
>>> +	}
>>> +
>>> +	mutex_unlock(&kvm->slots_lock);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +int
>>> +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
>>> +{
>>> +	if (args->rfd == -1 || args->wfd == -1)
>>> +		return kvm_rm_ioregion(kvm, args);
>>> +	return kvm_set_ioregion(kvm, args);
>>> +}
>>> diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
>>> new file mode 100644
>>> index 000000000000..23ffa812ec7a
>>> --- /dev/null
>>> +++ b/virt/kvm/ioregion.h
>>> @@ -0,0 +1,15 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +#ifndef __KVM_IOREGION_H__
>>> +#define __KVM_IOREGION_H__
>>> +
>>> +#ifdef CONFIG_KVM_IOREGION
>>> +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
>>> +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64
>>> start, u64 size);
>>> +#else
>>> +static inline bool
>>> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64
>>> size)
>>> +{
>>> +	return false;
>>> +}
>>> +#endif
>>> +#endif
>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>> index 2541a17ff1c4..385d8ec6350d 100644
>>> --- a/virt/kvm/kvm_main.c
>>> +++ b/virt/kvm/kvm_main.c
>>> @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long
>>> type)
>>>    	mmgrab(current->mm);
>>>    	kvm->mm = current->mm;
>>>    	kvm_eventfd_init(kvm);
>>> +	kvm_ioregionfd_init(kvm);
>>>    	mutex_init(&kvm->lock);
>>>    	mutex_init(&kvm->irq_lock);
>>>    	mutex_init(&kvm->slots_lock);
>>> @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
>>>    		r = kvm_vm_ioctl_set_memory_region(kvm,
>>> &kvm_userspace_mem);
>>>    		break;
>>>    	}
>>> +	case KVM_SET_IOREGION: {
>>> +		struct kvm_ioregion data;
>>> +
>>> +		r = -EFAULT;
>>> +		if (copy_from_user(&data, argp, sizeof(data)))
>>> +			goto out;
>>> +
>>> +		r = kvm_ioregionfd(kvm, &data);
>>> +		break;
>>> +	}
>>>    	case KVM_GET_DIRTY_LOG: {
>>>    		struct kvm_dirty_log log;
>>>    
>>> @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm,
>>> enum kvm_bus bus_idx, gpa_t addr,
>>>    	if (!bus)
>>>    		return -ENOMEM;
>>>    
>>> -	/* exclude ioeventfd which is limited by maximum fd */
>>> -	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
>>> -		return -ENOSPC;
>>> +	/* enforce hard limit if kmemcg is disabled and
>>> +	 * exclude ioeventfd which is limited by maximum fd
>>> +	 */
>>> +	if (!memcg_kmem_enabled())
>>> +		if (bus->dev_count - bus->ioeventfd_count >
>>> NR_IOBUS_DEVS - 1)
>>> +			return -ENOSPC;
>>>    
>>>    	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
>>>    			  GFP_KERNEL_ACCOUNT);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2021-01-03 20:37     ` Elena Afanasova
@ 2021-01-04  5:37       ` Jason Wang
  2021-01-05  0:06         ` Elena Afanasova
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-04  5:37 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/4 上午4:37, Elena Afanasova wrote:
> On Thu, 2020-12-31 at 11:46 +0800, Jason Wang wrote:
>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>> Signed-off-by: Elena Afanasova<eafanasova@gmail.com>
>>> ---
>>>    virt/kvm/ioregion.c | 157
>>> ++++++++++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 157 insertions(+)
>>>
>>> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
>>> index a200c3761343..8523f4126337 100644
>>> --- a/virt/kvm/ioregion.c
>>> +++ b/virt/kvm/ioregion.c
>>> @@ -4,6 +4,33 @@
>>>    #include <kvm/iodev.h>
>>>    #include "eventfd.h"
>>>    
>>> +/* Wire protocol */
>>> +struct ioregionfd_cmd {
>>> +	__u32 info;
>>> +	__u32 padding;
>>> +	__u64 user_data;
>>> +	__u64 offset;
>>> +	__u64 data;
>>> +};
>>> +
>> I wonder do we need a seq in the protocol. It might be useful if we
>> allow a pair of file descriptors to be used for multiple different
>> ranges.
>>
> I think it might be helpful in the case of out-of-order requests.
> In the case of in order requests seq field seems not to be necessary
> since there will be cmds/replies serialization. I’ll include the
> synchronization code in a RFC v2 series.


See my reply to V1. It might be helpful for the case of using a single 
ioregionfd for multiple ranges.

Thanks


>
>> Thanks
>>
>>
>>> +struct ioregionfd_resp {
>>> +	__u64 data;
>>> +	__u8 pad[24];
>>> +};


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-04  5:34       ` Jason Wang
@ 2021-01-05  0:02         ` Elena Afanasova
  2021-01-05  3:53           ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Elena Afanasova @ 2021-01-05  0:02 UTC (permalink / raw)
  To: Jason Wang, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva

On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
> On 2021/1/4 上午4:32, Elena Afanasova wrote:
> > On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
> > > How about FAST_MMIO?
> > > 
> > I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
> > suitable
> > for triggers which could use posted writes. The struct
> > ioregionfd_cmd
> > size bits and the data field will be unused in this case.
> 
> Note that eventfd checks for length and have datamatch support. Do
> we 
> need to do something similar.
> 
Do you think datamatch support is necessary for ioregionfd?

> I guess the idea is to have a generic interface to let eventfd work
> for 
> ioregion as well.
> 
It seems that posted writes are the only "fast" case in ioregionfd. So I
was thinking about using FAST_MMIO for this case only. Maybe in some
cases it will be better to just use ioeventfd. But I'm not sure.

> 
> > > > Guest
> > > > read and write accesses are dispatched through the given
> > > > ioregionfd
> > > > instead of returning from ioctl(KVM_RUN). Regions can be
> > > > deleted by
> > > > setting fds to -1.
> > > > 
> > > > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > > > ---
> > > >    arch/x86/kvm/Kconfig     |   1 +
> > > >    arch/x86/kvm/Makefile    |   1 +
> > > >    arch/x86/kvm/x86.c       |   1 +
> > > >    include/linux/kvm_host.h |  17 +++
> > > >    include/uapi/linux/kvm.h |  23 ++++
> > > >    virt/kvm/Kconfig         |   3 +
> > > >    virt/kvm/eventfd.c       |  25 +++++
> > > >    virt/kvm/eventfd.h       |  14 +++
> > > >    virt/kvm/ioregion.c      | 233
> > > > +++++++++++++++++++++++++++++++++++++++
> > > >    virt/kvm/ioregion.h      |  15 +++
> > > >    virt/kvm/kvm_main.c      |  20 +++-
> > > >    11 files changed, 350 insertions(+), 3 deletions(-)
> > > >    create mode 100644 virt/kvm/eventfd.h
> > > >    create mode 100644 virt/kvm/ioregion.c
> > > >    create mode 100644 virt/kvm/ioregion.h
> > > > 
> > > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > > > index f92dfd8ef10d..b914ef375199 100644
> > > > --- a/arch/x86/kvm/Kconfig
> > > > +++ b/arch/x86/kvm/Kconfig
> > > > @@ -33,6 +33,7 @@ config KVM
> > > >    	select HAVE_KVM_IRQ_BYPASS
> > > >    	select HAVE_KVM_IRQ_ROUTING
> > > >    	select HAVE_KVM_EVENTFD
> > > > +	select KVM_IOREGION
> > > >    	select KVM_ASYNC_PF
> > > >    	select USER_RETURN_NOTIFIER
> > > >    	select KVM_MMIO
> > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > index b804444e16d4..b3b17dc9f7d4 100644
> > > > --- a/arch/x86/kvm/Makefile
> > > > +++ b/arch/x86/kvm/Makefile
> > > > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> > > >    kvm-y			+= $(KVM)/kvm_main.o
> > > > $(KVM)/coalesced_mmio.o \
> > > >    				$(KVM)/eventfd.o
> > > > $(KVM)/irqchip.o
> > > > $(KVM)/vfio.o
> > > >    kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > > > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> > > >    
> > > >    kvm-y			+= x86.o emulate.o i8259.o
> > > > irq.o
> > > > lapic.o \
> > > >    			   i8254.o ioapic.o irq_comm.o cpuid.o
> > > > pmu.o
> > > > mtrr.o \
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index e545a8a613b1..ddb28f5ca252 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct
> > > > kvm
> > > > *kvm, long ext)
> > > >    	case KVM_CAP_X86_USER_SPACE_MSR:
> > > >    	case KVM_CAP_X86_MSR_FILTER:
> > > >    	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > > > +	case KVM_CAP_IOREGIONFD:
> > > >    		r = 1;
> > > >    		break;
> > > >    	case KVM_CAP_SYNC_REGS:
> > > > diff --git a/include/linux/kvm_host.h
> > > > b/include/linux/kvm_host.h
> > > > index 7f2e2a09ebbd..7cd667dddba9 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -470,6 +470,10 @@ struct kvm {
> > > >    		struct mutex      resampler_lock;
> > > >    	} irqfds;
> > > >    	struct list_head ioeventfds;
> > > > +#endif
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +	struct list_head ioregions_mmio;
> > > > +	struct list_head ioregions_pio;
> > > >    #endif
> > > >    	struct kvm_vm_stat stat;
> > > >    	struct kvm_arch arch;
> > > > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct
> > > > kvm
> > > > *kvm, struct kvm_ioeventfd *args)
> > > >    
> > > >    #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > > >    
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +void kvm_ioregionfd_init(struct kvm *kvm);
> > > > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion
> > > > *args);
> > > > +
> > > > +#else
> > > > +
> > > > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > > > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > > > kvm_ioregion *args)
> > > > +{
> > > > +	return -ENOSYS;
> > > > +}
> > > > +#endif
> > > > +
> > > >    void kvm_arch_irq_routing_update(struct kvm *kvm);
> > > >    
> > > >    static inline void kvm_make_request(int req, struct kvm_vcpu
> > > > *vcpu)
> > > > diff --git a/include/uapi/linux/kvm.h
> > > > b/include/uapi/linux/kvm.h
> > > > index ca41220b40b8..81e775778c66 100644
> > > > --- a/include/uapi/linux/kvm.h
> > > > +++ b/include/uapi/linux/kvm.h
> > > > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> > > >    	__u8  pad[36];
> > > >    };
> > > >    
> > > > +enum {
> > > > +	kvm_ioregion_flag_nr_pio,
> > > > +	kvm_ioregion_flag_nr_posted_writes,
> > > > +	kvm_ioregion_flag_nr_max,
> > > > +};
> > > > +
> > > > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > > > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > > > kvm_ioregion_flag_nr_posted_writes)
> > > > +
> > > > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > > > kvm_ioregion_flag_nr_max) - 1)
> > > > +
> > > > +struct kvm_ioregion {
> > > > +	__u64 guest_paddr; /* guest physical address */
> > > > +	__u64 memory_size; /* bytes */
> > > > +	__u64 user_data;
> > > What will this field do? Is it a token?
> > > 
> > Yes, it’s an opaque token that can be used by userspace in order to
> > determine which MemoryRegion to dispatch.
> 
> This part I don't understand. Userspace should know the fd number
> (which 
> I guess should be sufficient?).
> 
I think the user_data field can be useful if the same fd is registered
with multiple GPA ranges.

> 
> > > > +	__s32 rfd;
> > > > +	__s32 wfd;
> > > > +	__u32 flags;
> > > > +	__u8  pad[28];
> > > > +};
> > > Is this possible to register the same fd with multiple GPA
> > > ranges?
> > > If
> > > not, do we need to check for fd collision?
> > > 
> > Yes, it’s possible to register the same fd with multiple GPA
> > ranges.
> > 
> > > > +
> > > >    #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> > > >    #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> > > >    #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > > > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> > > >    #define KVM_CAP_X86_USER_SPACE_MSR 188
> > > >    #define KVM_CAP_X86_MSR_FILTER 189
> > > >    #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > > > +#define KVM_CAP_IOREGIONFD 191
> > > >    
> > > >    #ifdef KVM_CAP_IRQ_ROUTING
> > > >    
> > > > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> > > >    					struct
> > > > kvm_userspace_memory_region)
> > > >    #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> > > >    #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > > > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > > > kvm_ioregion)
> > > >    
> > > >    /* enable ucontrol for s390 */
> > > >    struct kvm_s390_ucas_mapping {
> > > > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > > > index 1c37ccd5d402..5e6620bbf000 100644
> > > > --- a/virt/kvm/Kconfig
> > > > +++ b/virt/kvm/Kconfig
> > > > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
> > > >           bool
> > > >           select EVENTFD
> > > >    
> > > > +config KVM_IOREGION
> > > > +       bool
> > > > +
> > > >    config KVM_MMIO
> > > >           bool
> > > >    
> > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > index c2323c27a28b..aadb73903f8b 100644
> > > > --- a/virt/kvm/eventfd.c
> > > > +++ b/virt/kvm/eventfd.c
> > > > @@ -27,6 +27,7 @@
> > > >    #include <trace/events/kvm.h>
> > > >    
> > > >    #include <kvm/iodev.h>
> > > > +#include "ioregion.h"
> > > >    
> > > >    #ifdef CONFIG_HAVE_KVM_IRQFD
> > > >    
> > > > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > > > ioeventfd_ops = {
> > > >    	.destructor = ioeventfd_destructor,
> > > >    };
> > > >    
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +/* assumes kvm->slots_lock held */
> > > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > > > +			  u64 start, u64 size)
> > > > +{
> > > > +	struct _ioeventfd *_p;
> > > > +
> > > > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > > > +		if (_p->bus_idx == bus_idx &&
> > > > +		    overlap(start, size, _p->addr,
> > > > +			    !_p->length ? 8 : _p->length))
> > > > +			return true;
> > > > +
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +
> > > >    /* assumes kvm->slots_lock held */
> > > >    static bool
> > > >    ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd
> > > > *p)
> > > > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > > > struct _ioeventfd *p)
> > > >    		       _p->datamatch == p->datamatch))))
> > > >    			return true;
> > > >    
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx ==
> > > > KVM_PIO_BUS)
> > > > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p-
> > > > >addr,
> > > > +					  !p->length ? 8 : p-
> > > > >length))
> > > > +			return true;
> > > > +#endif
> > > > +
> > > >    	return false;
> > > >    }
> > > >    
> > > > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> > > > new file mode 100644
> > > > index 000000000000..73a621eebae3
> > > > --- /dev/null
> > > > +++ b/virt/kvm/eventfd.h
> > > > @@ -0,0 +1,14 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > > +#ifndef __KVM_EVENTFD_H__
> > > > +#define __KVM_EVENTFD_H__
> > > > +
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64
> > > > start,
> > > > u64 size);
> > > > +#else
> > > > +static inline bool
> > > > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
> > > > u64
> > > > size)
> > > > +{
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +#endif
> > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > > > new file mode 100644
> > > > index 000000000000..a200c3761343
> > > > --- /dev/null
> > > > +++ b/virt/kvm/ioregion.c
> > > > @@ -0,0 +1,233 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +#include <linux/kvm_host.h>
> > > > +#include <linux/fs.h>
> > > > +#include <kvm/iodev.h>
> > > > +#include "eventfd.h"
> > > > +
> > > > +void
> > > > +kvm_ioregionfd_init(struct kvm *kvm)
> > > > +{
> > > > +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> > > > +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> > > > +}
> > > > +
> > > > +struct ioregion {
> > > > +	struct list_head     list;
> > > > +	u64                  paddr;
> > > > +	u64                  size;
> > > > +	struct file         *rf;
> > > > +	struct file         *wf;
> > > > +	u64                  user_data;
> > > > +	struct kvm_io_device dev;
> > > > +	bool                 posted_writes;
> > > > +};
> > > > +
> > > > +static inline struct ioregion *
> > > > +to_ioregion(struct kvm_io_device *dev)
> > > > +{
> > > > +	return container_of(dev, struct ioregion, dev);
> > > > +}
> > > > +
> > > > +/* assumes kvm->slots_lock held */
> > > > +static void
> > > > +ioregion_release(struct ioregion *p)
> > > > +{
> > > > +	fput(p->rf);
> > > > +	fput(p->wf);
> > > > +	list_del(&p->list);
> > > > +	kfree(p);
> > > > +}
> > > > +
> > > > +static int
> > > > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device
> > > > *this,
> > > > gpa_t addr,
> > > > +	      int len, void *val)
> > > > +{
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int
> > > > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device
> > > > *this,
> > > > gpa_t addr,
> > > > +		int len, const void *val)
> > > > +{
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * This function is called as KVM is completely shutting
> > > > down.  We
> > > > do not
> > > > + * need to worry about locking just nuke anything we have as
> > > > quickly as possible
> > > > + */
> > > > +static void
> > > > +ioregion_destructor(struct kvm_io_device *this)
> > > > +{
> > > > +	struct ioregion *p = to_ioregion(this);
> > > > +
> > > > +	ioregion_release(p);
> > > > +}
> > > > +
> > > > +static const struct kvm_io_device_ops ioregion_ops = {
> > > > +	.read       = ioregion_read,
> > > > +	.write      = ioregion_write,
> > > > +	.destructor = ioregion_destructor,
> > > > +};
> > > > +
> > > > +static inline struct list_head *
> > > > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> > > > +{
> > > > +	return (bus_idx == KVM_MMIO_BUS) ?
> > > > +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> > > > +}
> > > > +
> > > > +/* check for not overlapping case and reverse */
> > > > +inline bool
> > > > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > > > +{
> > > > +	u64 end1 = start1 + size1 - 1;
> > > > +	u64 end2 = start2 + size2 - 1;
> > > > +
> > > > +	return !(end1 < start2 || start1 >= end2);
> > > > +}
> > > > +
> > > > +/* assumes kvm->slots_lock held */
> > > > +bool
> > > > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> > > > +		      u64 start, u64 size)
> > > > +{
> > > > +	struct ioregion *_p;
> > > > +	struct list_head *ioregions;
> > > > +
> > > > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > > > +	list_for_each_entry(_p, ioregions, list)
> > > > +		if (overlap(start, size, _p->paddr, _p->size))
> > > > +			return true;
> > > > +
> > > > +	return false;
> > > > +}
> > > > +
> > > > +/* assumes kvm->slots_lock held */
> > > > +static bool
> > > > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
> > > > kvm_bus bus_idx)
> > > > +{
> > > > +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p-
> > > > >size) ||
> > > > +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p-
> > > > >size))
> > > > +		return true;
> > > > +
> > > > +	return false;
> > > > +}
> > > > +
> > > > +static enum kvm_bus
> > > > +get_bus_from_flags(__u32 flags)
> > > > +{
> > > > +	if (flags & KVM_IOREGION_PIO)
> > > > +		return KVM_PIO_BUS;
> > > > +	return KVM_MMIO_BUS;
> > > > +}
> > > > +
> > > > +int
> > > > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > > > +{
> > > > +	struct ioregion *p;
> > > > +	bool is_posted_writes;
> > > > +	struct file *rfile, *wfile;
> > > > +	enum kvm_bus bus_idx;
> > > > +	int ret = 0;
> > > > +
> > > > +	if (!args->memory_size)
> > > > +		return -EINVAL;
> > > > +	if ((args->guest_paddr + args->memory_size - 1) < args-
> > > > > guest_paddr)
> > > > +		return -EINVAL;
> > > > +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> > > > +		return -EINVAL;
> > > > +
> > > > +	rfile = fget(args->rfd);
> > > > +	if (!rfile)
> > > > +		return -EBADF;
> > > > +	wfile = fget(args->wfd);
> > > > +	if (!wfile) {
> > > > +		fput(rfile);
> > > > +		return -EBADF;
> > > > +	}
> > > > +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags &
> > > > O_NONBLOCK)) {
> > > > +		ret = -EINVAL;
> > > > +		goto fail;
> > > > +	}
> > > Instead of checking nonblocking, can we poll here?
> > > 
> > Yes, it’s possible. It would be necessary in the case of out-of-
> > order
> > requests. But since multiple in-flight messages don’t seem to be a
> > use
> > case I’m not sure if it’s necessary. Typically device register
> > accesses
> > should not take a long time, so making them asynchronous doesn't
> > seem
> > like a practical advantage. Also this might complicate the code and
> > make it slower. What do you think?
> 
> One issue I saw is that, if we register a single fd for e.g two
> regions. 
> And those two regions were read in parallel from guest. It looks to
> me 
> we don't have any synchronization in the current code.
> 
Yes, you are right. That’s why there will be cmds/replies serialization
in a v2 series.

> 
> > > > +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> > > > +	if (!p) {
> > > > +		ret = -ENOMEM;
> > > > +		goto fail;
> > > > +	}
> > > > +
> > > > +	INIT_LIST_HEAD(&p->list);
> > > > +	p->paddr = args->guest_paddr;
> > > > +	p->size = args->memory_size;
> > > > +	p->user_data = args->user_data;
> > > > +	p->rf = rfile;
> > > > +	p->wf = wfile;
> > > > +	is_posted_writes = args->flags &
> > > > KVM_IOREGION_POSTED_WRITES;
> > > > +	p->posted_writes = is_posted_writes ? true : false;
> > > > +	bus_idx = get_bus_from_flags(args->flags);
> > > > +
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +
> > > > +	if (ioregion_collision(kvm, p, bus_idx)) {
> > > > +		ret = -EEXIST;
> > > > +		goto unlock_fail;
> > > > +	}
> > > > +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> > > > +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr,
> > > > p->size,
> > > > +				      &p->dev);
> > > > +	if (ret < 0)
> > > > +		goto unlock_fail;
> > > We probably need to register to FAST_MMIO when bus_idx is MMIO.
> > > 
> > > 
> > > > +	list_add_tail(&p->list, get_ioregion_list(kvm,
> > > > bus_idx));
> > > > +
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +
> > > > +	return 0;
> > > > +
> > > > +unlock_fail:
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	kfree(p);
> > > > +fail:
> > > > +	fput(rfile);
> > > > +	fput(wfile);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int
> > > > +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > > > +{
> > > > +	struct ioregion         *p, *tmp;
> > > > +	enum kvm_bus             bus_idx;
> > > > +	int                      ret = -ENOENT;
> > > > +	struct list_head        *ioregions;
> > > > +
> > > > +	if (args->rfd != -1 || args->wfd != -1)
> > > > +		return -EINVAL;
> > > If we want to use ioregion fd for doorbell, rfd is probably not
> > > necessary here.
> > > 
> > This condition is simply a requirement that region can be deleted
> > in
> > the case of both fds are set to -1.
> 
> Ok.
> 
> Thanks
> 
> 
> > > Thanks
> > > 
> > > 
> > > > +
> > > > +	bus_idx = get_bus_from_flags(args->flags);
> > > > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > > > +
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +
> > > > +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> > > > +		if (p->paddr == args->guest_paddr  &&
> > > > +		    p->size == args->memory_size) {
> > > > +			kvm_io_bus_unregister_dev(kvm, bus_idx,
> > > > &p-
> > > > > dev);
> > > > +			ioregion_release(p);
> > > > +			ret = 0;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int
> > > > +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> > > > +{
> > > > +	if (args->rfd == -1 || args->wfd == -1)
> > > > +		return kvm_rm_ioregion(kvm, args);
> > > > +	return kvm_set_ioregion(kvm, args);
> > > > +}
> > > > diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> > > > new file mode 100644
> > > > index 000000000000..23ffa812ec7a
> > > > --- /dev/null
> > > > +++ b/virt/kvm/ioregion.h
> > > > @@ -0,0 +1,15 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > > +#ifndef __KVM_IOREGION_H__
> > > > +#define __KVM_IOREGION_H__
> > > > +
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +inline bool overlap(u64 start1, u64 size1, u64 start2, u64
> > > > size2);
> > > > +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64
> > > > start, u64 size);
> > > > +#else
> > > > +static inline bool
> > > > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start,
> > > > u64
> > > > size)
> > > > +{
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +#endif
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index 2541a17ff1c4..385d8ec6350d 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned
> > > > long
> > > > type)
> > > >    	mmgrab(current->mm);
> > > >    	kvm->mm = current->mm;
> > > >    	kvm_eventfd_init(kvm);
> > > > +	kvm_ioregionfd_init(kvm);
> > > >    	mutex_init(&kvm->lock);
> > > >    	mutex_init(&kvm->irq_lock);
> > > >    	mutex_init(&kvm->slots_lock);
> > > > @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file
> > > > *filp,
> > > >    		r = kvm_vm_ioctl_set_memory_region(kvm,
> > > > &kvm_userspace_mem);
> > > >    		break;
> > > >    	}
> > > > +	case KVM_SET_IOREGION: {
> > > > +		struct kvm_ioregion data;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&data, argp, sizeof(data)))
> > > > +			goto out;
> > > > +
> > > > +		r = kvm_ioregionfd(kvm, &data);
> > > > +		break;
> > > > +	}
> > > >    	case KVM_GET_DIRTY_LOG: {
> > > >    		struct kvm_dirty_log log;
> > > >    
> > > > @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm
> > > > *kvm,
> > > > enum kvm_bus bus_idx, gpa_t addr,
> > > >    	if (!bus)
> > > >    		return -ENOMEM;
> > > >    
> > > > -	/* exclude ioeventfd which is limited by maximum fd */
> > > > -	if (bus->dev_count - bus->ioeventfd_count >
> > > > NR_IOBUS_DEVS - 1)
> > > > -		return -ENOSPC;
> > > > +	/* enforce hard limit if kmemcg is disabled and
> > > > +	 * exclude ioeventfd which is limited by maximum fd
> > > > +	 */
> > > > +	if (!memcg_kmem_enabled())
> > > > +		if (bus->dev_count - bus->ioeventfd_count >
> > > > NR_IOBUS_DEVS - 1)
> > > > +			return -ENOSPC;
> > > >    
> > > >    	new_bus = kmalloc(struct_size(bus, range, bus-
> > > > >dev_count + 1),
> > > >    			  GFP_KERNEL_ACCOUNT);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations
  2021-01-04  5:37       ` Jason Wang
@ 2021-01-05  0:06         ` Elena Afanasova
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-01-05  0:06 UTC (permalink / raw)
  To: Jason Wang, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva

On Mon, 2021-01-04 at 13:37 +0800, Jason Wang wrote:
> On 2021/1/4 上午4:37, Elena Afanasova wrote:
> > On Thu, 2020-12-31 at 11:46 +0800, Jason Wang wrote:
> > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > > > Signed-off-by: Elena Afanasova<eafanasova@gmail.com>
> > > > ---
> > > >    virt/kvm/ioregion.c | 157
> > > > ++++++++++++++++++++++++++++++++++++++++++++
> > > >    1 file changed, 157 insertions(+)
> > > > 
> > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > > > index a200c3761343..8523f4126337 100644
> > > > --- a/virt/kvm/ioregion.c
> > > > +++ b/virt/kvm/ioregion.c
> > > > @@ -4,6 +4,33 @@
> > > >    #include <kvm/iodev.h>
> > > >    #include "eventfd.h"
> > > >    
> > > > +/* Wire protocol */
> > > > +struct ioregionfd_cmd {
> > > > +	__u32 info;
> > > > +	__u32 padding;
> > > > +	__u64 user_data;
> > > > +	__u64 offset;
> > > > +	__u64 data;
> > > > +};
> > > > +
> > > I wonder do we need a seq in the protocol. It might be useful if
> > > we
> > > allow a pair of file descriptors to be used for multiple
> > > different
> > > ranges.
> > > 
> > I think it might be helpful in the case of out-of-order requests.
> > In the case of in order requests seq field seems not to be
> > necessary
> > since there will be cmds/replies serialization. I’ll include the
> > synchronization code in a RFC v2 series.
> 
> See my reply to V1. It might be helpful for the case of using single 
> ioregionfd for multiple ranges.
> 
Ok, thank you!

> Thanks
> 
> 
> > > Thanks
> > > 
> > > 
> > > > +struct ioregionfd_resp {
> > > > +	__u64 data;
> > > > +	__u8 pad[24];
> > > > +};


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-05  0:02         ` Elena Afanasova
@ 2021-01-05  3:53           ` Jason Wang
  2021-01-05 10:25             ` Stefan Hajnoczi
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-05  3:53 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/5 上午8:02, Elena Afanasova wrote:
> On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
>> On 2021/1/4 上午4:32, Elena Afanasova wrote:
>>> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>>>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>>>> This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
>>>> How about FAST_MMIO?
>>>>
>>> I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
>>> suitable
>>> for triggers which could use posted writes. The struct
>>> ioregionfd_cmd
>>> size bits and the data field will be unused in this case.
>> Note that eventfd checks for length and have datamatch support. Do
>> we
>> need to do something similar.
>>
> Do you think datamatch support is necessary for ioregionfd?


I'm not sure. But if we don't have this support, it probably means we 
can't use eventfd for ioregionfd.


>
>> I guess the idea is to have a generic interface to let eventfd work
>> for
>> ioregion as well.
>>
> It seems that posted writes is the only "fast" case in ioregionfd. So I
> was thinking about using FAST_MMIO for this case only. Maybe in some
> cases it will be better to just use ioeventfd. But I'm not sure.


To be a generic infrastructure, it's better to have this, but we can 
listen from the opinion of others.


>
>>>>> Guest
>>>>> read and write accesses are dispatched through the given
>>>>> ioregionfd
>>>>> instead of returning from ioctl(KVM_RUN). Regions can be
>>>>> deleted by
>>>>> setting fds to -1.
>>>>>
>>>>> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
>>>>> ---
>>>>>     arch/x86/kvm/Kconfig     |   1 +
>>>>>     arch/x86/kvm/Makefile    |   1 +
>>>>>     arch/x86/kvm/x86.c       |   1 +
>>>>>     include/linux/kvm_host.h |  17 +++
>>>>>     include/uapi/linux/kvm.h |  23 ++++
>>>>>     virt/kvm/Kconfig         |   3 +
>>>>>     virt/kvm/eventfd.c       |  25 +++++
>>>>>     virt/kvm/eventfd.h       |  14 +++
>>>>>     virt/kvm/ioregion.c      | 233
>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>     virt/kvm/ioregion.h      |  15 +++
>>>>>     virt/kvm/kvm_main.c      |  20 +++-
>>>>>     11 files changed, 350 insertions(+), 3 deletions(-)
>>>>>     create mode 100644 virt/kvm/eventfd.h
>>>>>     create mode 100644 virt/kvm/ioregion.c
>>>>>     create mode 100644 virt/kvm/ioregion.h
>>>>>
>>>>> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
>>>>> index f92dfd8ef10d..b914ef375199 100644
>>>>> --- a/arch/x86/kvm/Kconfig
>>>>> +++ b/arch/x86/kvm/Kconfig
>>>>> @@ -33,6 +33,7 @@ config KVM
>>>>>     	select HAVE_KVM_IRQ_BYPASS
>>>>>     	select HAVE_KVM_IRQ_ROUTING
>>>>>     	select HAVE_KVM_EVENTFD
>>>>> +	select KVM_IOREGION
>>>>>     	select KVM_ASYNC_PF
>>>>>     	select USER_RETURN_NOTIFIER
>>>>>     	select KVM_MMIO
>>>>> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
>>>>> index b804444e16d4..b3b17dc9f7d4 100644
>>>>> --- a/arch/x86/kvm/Makefile
>>>>> +++ b/arch/x86/kvm/Makefile
>>>>> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>>>>>     kvm-y			+= $(KVM)/kvm_main.o
>>>>> $(KVM)/coalesced_mmio.o \
>>>>>     				$(KVM)/eventfd.o
>>>>> $(KVM)/irqchip.o
>>>>> $(KVM)/vfio.o
>>>>>     kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>>>>> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>>>>>     
>>>>>     kvm-y			+= x86.o emulate.o i8259.o
>>>>> irq.o
>>>>> lapic.o \
>>>>>     			   i8254.o ioapic.o irq_comm.o cpuid.o
>>>>> pmu.o
>>>>> mtrr.o \
>>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>> index e545a8a613b1..ddb28f5ca252 100644
>>>>> --- a/arch/x86/kvm/x86.c
>>>>> +++ b/arch/x86/kvm/x86.c
>>>>> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct
>>>>> kvm
>>>>> *kvm, long ext)
>>>>>     	case KVM_CAP_X86_USER_SPACE_MSR:
>>>>>     	case KVM_CAP_X86_MSR_FILTER:
>>>>>     	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
>>>>> +	case KVM_CAP_IOREGIONFD:
>>>>>     		r = 1;
>>>>>     		break;
>>>>>     	case KVM_CAP_SYNC_REGS:
>>>>> diff --git a/include/linux/kvm_host.h
>>>>> b/include/linux/kvm_host.h
>>>>> index 7f2e2a09ebbd..7cd667dddba9 100644
>>>>> --- a/include/linux/kvm_host.h
>>>>> +++ b/include/linux/kvm_host.h
>>>>> @@ -470,6 +470,10 @@ struct kvm {
>>>>>     		struct mutex      resampler_lock;
>>>>>     	} irqfds;
>>>>>     	struct list_head ioeventfds;
>>>>> +#endif
>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>> +	struct list_head ioregions_mmio;
>>>>> +	struct list_head ioregions_pio;
>>>>>     #endif
>>>>>     	struct kvm_vm_stat stat;
>>>>>     	struct kvm_arch arch;
>>>>> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct
>>>>> kvm
>>>>> *kvm, struct kvm_ioeventfd *args)
>>>>>     
>>>>>     #endif /* CONFIG_HAVE_KVM_EVENTFD */
>>>>>     
>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>> +void kvm_ioregionfd_init(struct kvm *kvm);
>>>>> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion
>>>>> *args);
>>>>> +
>>>>> +#else
>>>>> +
>>>>> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
>>>>> +static inline int kvm_ioregionfd(struct kvm *kvm, struct
>>>>> kvm_ioregion *args)
>>>>> +{
>>>>> +	return -ENOSYS;
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>>     void kvm_arch_irq_routing_update(struct kvm *kvm);
>>>>>     
>>>>>     static inline void kvm_make_request(int req, struct kvm_vcpu
>>>>> *vcpu)
>>>>> diff --git a/include/uapi/linux/kvm.h
>>>>> b/include/uapi/linux/kvm.h
>>>>> index ca41220b40b8..81e775778c66 100644
>>>>> --- a/include/uapi/linux/kvm.h
>>>>> +++ b/include/uapi/linux/kvm.h
>>>>> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>>>>>     	__u8  pad[36];
>>>>>     };
>>>>>     
>>>>> +enum {
>>>>> +	kvm_ioregion_flag_nr_pio,
>>>>> +	kvm_ioregion_flag_nr_posted_writes,
>>>>> +	kvm_ioregion_flag_nr_max,
>>>>> +};
>>>>> +
>>>>> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
>>>>> +#define KVM_IOREGION_POSTED_WRITES (1 <<
>>>>> kvm_ioregion_flag_nr_posted_writes)
>>>>> +
>>>>> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
>>>>> kvm_ioregion_flag_nr_max) - 1)
>>>>> +
>>>>> +struct kvm_ioregion {
>>>>> +	__u64 guest_paddr; /* guest physical address */
>>>>> +	__u64 memory_size; /* bytes */
>>>>> +	__u64 user_data;
>>>> What will this field do? Is it a token?
>>>>
>>> Yes, it’s an opaque token that can be used by userspace in order to
>>> determine which MemoryRegion to dispatch.
>> This part I don't understand. Userspace should know the fd number
>> (which
>> I guess should be sufficient?).
>>
> I think the user_data field can be useful if same fd is registered with
> multiple GPA ranges.


Yes, but if I read the code correctly, we encode the address in the 
protocol. Isn't it sufficient?


>
>>>>> +	__s32 rfd;
>>>>> +	__s32 wfd;
>>>>> +	__u32 flags;
>>>>> +	__u8  pad[28];
>>>>> +};
>>>> Is this possible to register the same fd with multiple GPA
>>>> ranges?
>>>> If
>>>> not, do we need to check for fd collision?
>>>>
>>> Yes, it’s possible to register the same fd with multiple GPA
>>> ranges.
>>>
>>>>> +
>>>>>     #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>>>>>     #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>>>>>     #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
>>>>> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>>>>>     #define KVM_CAP_X86_USER_SPACE_MSR 188
>>>>>     #define KVM_CAP_X86_MSR_FILTER 189
>>>>>     #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
>>>>> +#define KVM_CAP_IOREGIONFD 191
>>>>>     
>>>>>     #ifdef KVM_CAP_IRQ_ROUTING
>>>>>     
>>>>> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>>>>>     					struct
>>>>> kvm_userspace_memory_region)
>>>>>     #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>>>>>     #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
>>>>> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
>>>>> kvm_ioregion)
>>>>>     
>>>>>     /* enable ucontrol for s390 */
>>>>>     struct kvm_s390_ucas_mapping {
>>>>> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
>>>>> index 1c37ccd5d402..5e6620bbf000 100644
>>>>> --- a/virt/kvm/Kconfig
>>>>> +++ b/virt/kvm/Kconfig
>>>>> @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
>>>>>            bool
>>>>>            select EVENTFD
>>>>>     
>>>>> +config KVM_IOREGION
>>>>> +       bool
>>>>> +
>>>>>     config KVM_MMIO
>>>>>            bool
>>>>>     
>>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>>> index c2323c27a28b..aadb73903f8b 100644
>>>>> --- a/virt/kvm/eventfd.c
>>>>> +++ b/virt/kvm/eventfd.c
>>>>> @@ -27,6 +27,7 @@
>>>>>     #include <trace/events/kvm.h>
>>>>>     
>>>>>     #include <kvm/iodev.h>
>>>>> +#include "ioregion.h"
>>>>>     
>>>>>     #ifdef CONFIG_HAVE_KVM_IRQFD
>>>>>     
>>>>> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
>>>>> ioeventfd_ops = {
>>>>>     	.destructor = ioeventfd_destructor,
>>>>>     };
>>>>>     
>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>> +/* assumes kvm->slots_lock held */
>>>>> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
>>>>> +			  u64 start, u64 size)
>>>>> +{
>>>>> +	struct _ioeventfd *_p;
>>>>> +
>>>>> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
>>>>> +		if (_p->bus_idx == bus_idx &&
>>>>> +		    overlap(start, size, _p->addr,
>>>>> +			    !_p->length ? 8 : _p->length))
>>>>> +			return true;
>>>>> +
>>>>> +	return false;
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>>     /* assumes kvm->slots_lock held */
>>>>>     static bool
>>>>>     ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd
>>>>> *p)
>>>>> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
>>>>> struct _ioeventfd *p)
>>>>>     		       _p->datamatch == p->datamatch))))
>>>>>     			return true;
>>>>>     
>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx ==
>>>>> KVM_PIO_BUS)
>>>>> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p-
>>>>>> addr,
>>>>> +					  !p->length ? 8 : p-
>>>>>> length))
>>>>> +			return true;
>>>>> +#endif
>>>>> +
>>>>>     	return false;
>>>>>     }
>>>>>     
>>>>> diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
>>>>> new file mode 100644
>>>>> index 000000000000..73a621eebae3
>>>>> --- /dev/null
>>>>> +++ b/virt/kvm/eventfd.h
>>>>> @@ -0,0 +1,14 @@
>>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>>> +#ifndef __KVM_EVENTFD_H__
>>>>> +#define __KVM_EVENTFD_H__
>>>>> +
>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64
>>>>> start,
>>>>> u64 size);
>>>>> +#else
>>>>> +static inline bool
>>>>> +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
>>>>> u64
>>>>> size)
>>>>> +{
>>>>> +	return false;
>>>>> +}
>>>>> +#endif
>>>>> +#endif
>>>>> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
>>>>> new file mode 100644
>>>>> index 000000000000..a200c3761343
>>>>> --- /dev/null
>>>>> +++ b/virt/kvm/ioregion.c
>>>>> @@ -0,0 +1,233 @@
>>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>>> +#include <linux/kvm_host.h>
>>>>> +#include <linux/fs.h>
>>>>> +#include <kvm/iodev.h>
>>>>> +#include "eventfd.h"
>>>>> +
>>>>> +void
>>>>> +kvm_ioregionfd_init(struct kvm *kvm)
>>>>> +{
>>>>> +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
>>>>> +	INIT_LIST_HEAD(&kvm->ioregions_pio);
>>>>> +}
>>>>> +
>>>>> +struct ioregion {
>>>>> +	struct list_head     list;
>>>>> +	u64                  paddr;
>>>>> +	u64                  size;
>>>>> +	struct file         *rf;
>>>>> +	struct file         *wf;
>>>>> +	u64                  user_data;
>>>>> +	struct kvm_io_device dev;
>>>>> +	bool                 posted_writes;
>>>>> +};
>>>>> +
>>>>> +static inline struct ioregion *
>>>>> +to_ioregion(struct kvm_io_device *dev)
>>>>> +{
>>>>> +	return container_of(dev, struct ioregion, dev);
>>>>> +}
>>>>> +
>>>>> +/* assumes kvm->slots_lock held */
>>>>> +static void
>>>>> +ioregion_release(struct ioregion *p)
>>>>> +{
>>>>> +	fput(p->rf);
>>>>> +	fput(p->wf);
>>>>> +	list_del(&p->list);
>>>>> +	kfree(p);
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device
>>>>> *this,
>>>>> gpa_t addr,
>>>>> +	      int len, void *val)
>>>>> +{
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device
>>>>> *this,
>>>>> gpa_t addr,
>>>>> +		int len, const void *val)
>>>>> +{
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * This function is called as KVM is completely shutting
>>>>> down.  We
>>>>> do not
>>>>> + * need to worry about locking just nuke anything we have as
>>>>> quickly as possible
>>>>> + */
>>>>> +static void
>>>>> +ioregion_destructor(struct kvm_io_device *this)
>>>>> +{
>>>>> +	struct ioregion *p = to_ioregion(this);
>>>>> +
>>>>> +	ioregion_release(p);
>>>>> +}
>>>>> +
>>>>> +static const struct kvm_io_device_ops ioregion_ops = {
>>>>> +	.read       = ioregion_read,
>>>>> +	.write      = ioregion_write,
>>>>> +	.destructor = ioregion_destructor,
>>>>> +};
>>>>> +
>>>>> +static inline struct list_head *
>>>>> +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
>>>>> +{
>>>>> +	return (bus_idx == KVM_MMIO_BUS) ?
>>>>> +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
>>>>> +}
>>>>> +
>>>>> +/* check for not overlapping case and reverse */
>>>>> +inline bool
>>>>> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
>>>>> +{
>>>>> +	u64 end1 = start1 + size1 - 1;
>>>>> +	u64 end2 = start2 + size2 - 1;
>>>>> +
>>>>> +	return !(end1 < start2 || start1 >= end2);
>>>>> +}
>>>>> +
>>>>> +/* assumes kvm->slots_lock held */
>>>>> +bool
>>>>> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
>>>>> +		      u64 start, u64 size)
>>>>> +{
>>>>> +	struct ioregion *_p;
>>>>> +	struct list_head *ioregions;
>>>>> +
>>>>> +	ioregions = get_ioregion_list(kvm, bus_idx);
>>>>> +	list_for_each_entry(_p, ioregions, list)
>>>>> +		if (overlap(start, size, _p->paddr, _p->size))
>>>>> +			return true;
>>>>> +
>>>>> +	return false;
>>>>> +}
>>>>> +
>>>>> +/* assumes kvm->slots_lock held */
>>>>> +static bool
>>>>> +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
>>>>> kvm_bus bus_idx)
>>>>> +{
>>>>> +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p-
>>>>>> size) ||
>>>>> +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p-
>>>>>> size))
>>>>> +		return true;
>>>>> +
>>>>> +	return false;
>>>>> +}
>>>>> +
>>>>> +static enum kvm_bus
>>>>> +get_bus_from_flags(__u32 flags)
>>>>> +{
>>>>> +	if (flags & KVM_IOREGION_PIO)
>>>>> +		return KVM_PIO_BUS;
>>>>> +	return KVM_MMIO_BUS;
>>>>> +}
>>>>> +
>>>>> +int
>>>>> +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
>>>>> +{
>>>>> +	struct ioregion *p;
>>>>> +	bool is_posted_writes;
>>>>> +	struct file *rfile, *wfile;
>>>>> +	enum kvm_bus bus_idx;
>>>>> +	int ret = 0;
>>>>> +
>>>>> +	if (!args->memory_size)
>>>>> +		return -EINVAL;
>>>>> +	if ((args->guest_paddr + args->memory_size - 1) < args-
>>>>>> guest_paddr)
>>>>> +		return -EINVAL;
>>>>> +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
>>>>> +		return -EINVAL;
>>>>> +
>>>>> +	rfile = fget(args->rfd);
>>>>> +	if (!rfile)
>>>>> +		return -EBADF;
>>>>> +	wfile = fget(args->wfd);
>>>>> +	if (!wfile) {
>>>>> +		fput(rfile);
>>>>> +		return -EBADF;
>>>>> +	}
>>>>> +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags &
>>>>> O_NONBLOCK)) {
>>>>> +		ret = -EINVAL;
>>>>> +		goto fail;
>>>>> +	}
>>>> Instead of checking nonblocking, can we poll here?
>>>>
>>> Yes, it’s possible. It would be necessary in the case of out-of-
>>> order
>>> requests. But since multiple in-flight messages don’t seem to be a
>>> use
>>> case I’m not sure if it’s necessary. Typically device register
>>> accesses
>>> should not take a long time, so making them asynchronous doesn't
>>> seem
>>> like a practical advantage. Also this might complicate the code and
>>> make it slower. What do you think?
>> One issue I saw is that, if we register a single fd for e.g two
>> regions.
>> And those two regions were read in parallel from guest. It looks to
>> me
>> we don't have any synchronization in the current code.
>>
> Yes, you are right. That’s why there will be cmds/replies serialization
> in a v2 series.


I see.

Thanks


>
>>>>> +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
>>>>> +	if (!p) {
>>>>> +		ret = -ENOMEM;
>>>>> +		goto fail;
>>>>> +	}
>>>>> +
>>>>> +	INIT_LIST_HEAD(&p->list);
>>>>> +	p->paddr = args->guest_paddr;
>>>>> +	p->size = args->memory_size;
>>>>> +	p->user_data = args->user_data;
>>>>> +	p->rf = rfile;
>>>>> +	p->wf = wfile;
>>>>> +	is_posted_writes = args->flags &
>>>>> KVM_IOREGION_POSTED_WRITES;
>>>>> +	p->posted_writes = is_posted_writes ? true : false;
>>>>> +	bus_idx = get_bus_from_flags(args->flags);
>>>>> +
>>>>> +	mutex_lock(&kvm->slots_lock);
>>>>> +
>>>>> +	if (ioregion_collision(kvm, p, bus_idx)) {
>>>>> +		ret = -EEXIST;
>>>>> +		goto unlock_fail;
>>>>> +	}
>>>>> +	kvm_iodevice_init(&p->dev, &ioregion_ops);
>>>>> +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr,
>>>>> p->size,
>>>>> +				      &p->dev);
>>>>> +	if (ret < 0)
>>>>> +		goto unlock_fail;
>>>> We probably need to register to FAST_MMIO when bus_idx is MMIO.
>>>>
>>>>
>>>>> +	list_add_tail(&p->list, get_ioregion_list(kvm,
>>>>> bus_idx));
>>>>> +
>>>>> +	mutex_unlock(&kvm->slots_lock);
>>>>> +
>>>>> +	return 0;
>>>>> +
>>>>> +unlock_fail:
>>>>> +	mutex_unlock(&kvm->slots_lock);
>>>>> +	kfree(p);
>>>>> +fail:
>>>>> +	fput(rfile);
>>>>> +	fput(wfile);
>>>>> +
>>>>> +	return ret;
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
>>>>> +{
>>>>> +	struct ioregion         *p, *tmp;
>>>>> +	enum kvm_bus             bus_idx;
>>>>> +	int                      ret = -ENOENT;
>>>>> +	struct list_head        *ioregions;
>>>>> +
>>>>> +	if (args->rfd != -1 || args->wfd != -1)
>>>>> +		return -EINVAL;
>>>> If we want to use ioregion fd for doorbell, rfd is probably not
>>>> necessary here.
>>>>
>>> This condition is simply a requirement that region can be deleted
>>> in
>>> the case of both fds are set to -1.
>> Ok.
>>
>> Thanks
>>
>>
>>>> Thanks
>>>>
>>>>
>>>>> +
>>>>> +	bus_idx = get_bus_from_flags(args->flags);
>>>>> +	ioregions = get_ioregion_list(kvm, bus_idx);
>>>>> +
>>>>> +	mutex_lock(&kvm->slots_lock);
>>>>> +
>>>>> +	list_for_each_entry_safe(p, tmp, ioregions, list) {
>>>>> +		if (p->paddr == args->guest_paddr  &&
>>>>> +		    p->size == args->memory_size) {
>>>>> +			kvm_io_bus_unregister_dev(kvm, bus_idx,
>>>>> &p-
>>>>>> dev);
>>>>> +			ioregion_release(p);
>>>>> +			ret = 0;
>>>>> +			break;
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	mutex_unlock(&kvm->slots_lock);
>>>>> +
>>>>> +	return ret;
>>>>> +}
>>>>> +
>>>>> +int
>>>>> +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
>>>>> +{
>>>>> +	if (args->rfd == -1 || args->wfd == -1)
>>>>> +		return kvm_rm_ioregion(kvm, args);
>>>>> +	return kvm_set_ioregion(kvm, args);
>>>>> +}
>>>>> diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
>>>>> new file mode 100644
>>>>> index 000000000000..23ffa812ec7a
>>>>> --- /dev/null
>>>>> +++ b/virt/kvm/ioregion.h
>>>>> @@ -0,0 +1,15 @@
>>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>>> +#ifndef __KVM_IOREGION_H__
>>>>> +#define __KVM_IOREGION_H__
>>>>> +
>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>> +inline bool overlap(u64 start1, u64 size1, u64 start2, u64
>>>>> size2);
>>>>> +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64
>>>>> start, u64 size);
>>>>> +#else
>>>>> +static inline bool
>>>>> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start,
>>>>> u64
>>>>> size)
>>>>> +{
>>>>> +	return false;
>>>>> +}
>>>>> +#endif
>>>>> +#endif
>>>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>>>> index 2541a17ff1c4..385d8ec6350d 100644
>>>>> --- a/virt/kvm/kvm_main.c
>>>>> +++ b/virt/kvm/kvm_main.c
>>>>> @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned
>>>>> long
>>>>> type)
>>>>>     	mmgrab(current->mm);
>>>>>     	kvm->mm = current->mm;
>>>>>     	kvm_eventfd_init(kvm);
>>>>> +	kvm_ioregionfd_init(kvm);
>>>>>     	mutex_init(&kvm->lock);
>>>>>     	mutex_init(&kvm->irq_lock);
>>>>>     	mutex_init(&kvm->slots_lock);
>>>>> @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file
>>>>> *filp,
>>>>>     		r = kvm_vm_ioctl_set_memory_region(kvm,
>>>>> &kvm_userspace_mem);
>>>>>     		break;
>>>>>     	}
>>>>> +	case KVM_SET_IOREGION: {
>>>>> +		struct kvm_ioregion data;
>>>>> +
>>>>> +		r = -EFAULT;
>>>>> +		if (copy_from_user(&data, argp, sizeof(data)))
>>>>> +			goto out;
>>>>> +
>>>>> +		r = kvm_ioregionfd(kvm, &data);
>>>>> +		break;
>>>>> +	}
>>>>>     	case KVM_GET_DIRTY_LOG: {
>>>>>     		struct kvm_dirty_log log;
>>>>>     
>>>>> @@ -4301,9 +4312,12 @@ int kvm_io_bus_register_dev(struct kvm
>>>>> *kvm,
>>>>> enum kvm_bus bus_idx, gpa_t addr,
>>>>>     	if (!bus)
>>>>>     		return -ENOMEM;
>>>>>     
>>>>> -	/* exclude ioeventfd which is limited by maximum fd */
>>>>> -	if (bus->dev_count - bus->ioeventfd_count >
>>>>> NR_IOBUS_DEVS - 1)
>>>>> -		return -ENOSPC;
>>>>> +	/* enforce hard limit if kmemcg is disabled and
>>>>> +	 * exclude ioeventfd which is limited by maximum fd
>>>>> +	 */
>>>>> +	if (!memcg_kmem_enabled())
>>>>> +		if (bus->dev_count - bus->ioeventfd_count >
>>>>> NR_IOBUS_DEVS - 1)
>>>>> +			return -ENOSPC;
>>>>>     
>>>>>     	new_bus = kmalloc(struct_size(bus, range, bus-
>>>>>> dev_count + 1),
>>>>>     			  GFP_KERNEL_ACCOUNT);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-05  3:53           ` Jason Wang
@ 2021-01-05 10:25             ` Stefan Hajnoczi
  2021-01-06  5:21               ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-05 10:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 12085 bytes --]

On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
> 
> On 2021/1/5 上午8:02, Elena Afanasova wrote:
> > On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
> > > On 2021/1/4 上午4:32, Elena Afanasova wrote:
> > > > On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> > > > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > > > > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
> > > > > How about FAST_MMIO?
> > > > > 
> > > > I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
> > > > suitable
> > > > for triggers which could use posted writes. The struct
> > > > ioregionfd_cmd
> > > > size bits and the data field will be unused in this case.
> > > Note that eventfd checks for length and have datamatch support. Do
> > > we
> > > need to do something similar.
> > > 
> > Do you think datamatch support is necessary for ioregionfd?
> 
> 
> I'm not sure. But if we don't have this support, it probably means we can't
> use eventfd for ioregionfd.

This is an interesting question because ioregionfd and ioeventfd have
different semantics. While it would be great to support all ioeventfd
features in ioregionfd, I'm not sure that is possible. I think ioeventfd
will remain useful for devices that only need a doorbell write register.

The differences:

1. ioeventfd has datamatch. This could be implemented in ioregionfd so
   that a datamatch failure results in the classic ioctl(KVM_RUN)
   MMIO/PIO exit reason and the VMM can handle the access.

   I'm not sure if this feature is useful though. Most of the time
   ioregionfd users want to handle all accesses to the region and the
   VMM may not even know how to handle register accesses because they
   can only be handled in a dedicated thread or an out-of-process
   device.

2. Write coalescing. ioeventfd combines writes because an eventfd is a
   counter. The counter is incremented on each write and the counter is
   reset to zero by reading the eventfd. This way a slow userspace can
   read the eventfd just once while a fast guest writes to it many times
   (similar to interrupt coalescing in physical hardware). ioregionfd
   cannot really do that, userspace will have to read one struct
   ioregionfd_cmd per guest access. Elena and I briefly discussed
   optimizing this by implementing a custom struct file_operations so
   the ->read() callback can coalesce multiple writes to the same
   address, but this makes sense mostly for guest write operations in
   FAST_MMIO mode, doesn't allow userspace to provide any type of fd
   (socket, pipe, etc), and increases the complexity.

Keeping in mind that ioeventfd and ioregionfd can be combined, I think
the main advantage to supporting all ioeventfd features in ioregionfd is
uniformity (offering everything through a single interface).

Supporting ioeventfd features in ioregionfd is possible to an extent but
will make ioctl(KVM_SET_IOREGION) more complex and userspace will still
have to create multiple fds because ioeventfd-style doorbell write
registers have different semantics from regular ioregionfd regions
(no posted writes).

My thoughts are that ioregionfd should do what it's good at and
ioeventfd should do what it's good at.

> > > I guess the idea is to have a generic interface to let eventfd work
> > > for
> > > ioregion as well.
> > > 
> > It seems that posted writes is the only "fast" case in ioregionfd. So I
> > was thinking about using FAST_MMIO for this case only. Maybe in some
> > cases it will be better to just use ioeventfd. But I'm not sure.
> 
> 
> To be a generic infrastructure, it's better to have this, but we can listen
> from the opinion of others.

I think we want both FAST_MMIO and regular MMIO options for posted
writes:

1. FAST_MMIO - ioregionfd_cmd size and data fields are zero and do not
   contain information about the nature of the guest access. This is
   fine for ioeventfd doorbell style registers because we don't need
   that information.

2. Regular MMIO - ioregionfd_cmd size and data fields contain valid data
   about the nature of the guest access. This is needed when the device
   register is more than a simple "kick" doorbell. For example, if the
   device needs to know the value that the guest wrote.

I suggest defining an additional KVM_SET_IOREGION flag called
KVM_IOREGION_FAST_MMIO that can be set together with
KVM_IOREGION_POSTED_WRITES.

KVM_IOREGION_PIO cannot be used together with KVM_IOREGION_FAST_MMIO.

In theory KVM_IOREGION_POSTED_WRITES doesn't need to be set with
KVM_IOREGION_FAST_MMIO. Userspace would have to send back a struct
ioregionfd_resp to acknowledge that the write has been handled.

Read accesses are indistinguishable from write accesses with
KVM_IOREGION_FAST_MMIO so it only makes sense to use the flag on
write-only regions. If the guest performs a read then userspace will see
a write and the destination CPU register will be unchanged (I think this
is already the case for FAST_MMIO).

> > > > > > Guest
> > > > > > read and write accesses are dispatched through the given
> > > > > > ioregionfd
> > > > > > instead of returning from ioctl(KVM_RUN). Regions can be
> > > > > > deleted by
> > > > > > setting fds to -1.
> > > > > > 
> > > > > > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > > > > > ---
> > > > > >     arch/x86/kvm/Kconfig     |   1 +
> > > > > >     arch/x86/kvm/Makefile    |   1 +
> > > > > >     arch/x86/kvm/x86.c       |   1 +
> > > > > >     include/linux/kvm_host.h |  17 +++
> > > > > >     include/uapi/linux/kvm.h |  23 ++++
> > > > > >     virt/kvm/Kconfig         |   3 +
> > > > > >     virt/kvm/eventfd.c       |  25 +++++
> > > > > >     virt/kvm/eventfd.h       |  14 +++
> > > > > >     virt/kvm/ioregion.c      | 233
> > > > > > +++++++++++++++++++++++++++++++++++++++
> > > > > >     virt/kvm/ioregion.h      |  15 +++
> > > > > >     virt/kvm/kvm_main.c      |  20 +++-
> > > > > >     11 files changed, 350 insertions(+), 3 deletions(-)
> > > > > >     create mode 100644 virt/kvm/eventfd.h
> > > > > >     create mode 100644 virt/kvm/ioregion.c
> > > > > >     create mode 100644 virt/kvm/ioregion.h
> > > > > > 
> > > > > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > > > > > index f92dfd8ef10d..b914ef375199 100644
> > > > > > --- a/arch/x86/kvm/Kconfig
> > > > > > +++ b/arch/x86/kvm/Kconfig
> > > > > > @@ -33,6 +33,7 @@ config KVM
> > > > > >     	select HAVE_KVM_IRQ_BYPASS
> > > > > >     	select HAVE_KVM_IRQ_ROUTING
> > > > > >     	select HAVE_KVM_EVENTFD
> > > > > > +	select KVM_IOREGION
> > > > > >     	select KVM_ASYNC_PF
> > > > > >     	select USER_RETURN_NOTIFIER
> > > > > >     	select KVM_MMIO
> > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > > > index b804444e16d4..b3b17dc9f7d4 100644
> > > > > > --- a/arch/x86/kvm/Makefile
> > > > > > +++ b/arch/x86/kvm/Makefile
> > > > > > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> > > > > >     kvm-y			+= $(KVM)/kvm_main.o
> > > > > > $(KVM)/coalesced_mmio.o \
> > > > > >     				$(KVM)/eventfd.o
> > > > > > $(KVM)/irqchip.o
> > > > > > $(KVM)/vfio.o
> > > > > >     kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > > > > > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> > > > > >     kvm-y			+= x86.o emulate.o i8259.o
> > > > > > irq.o
> > > > > > lapic.o \
> > > > > >     			   i8254.o ioapic.o irq_comm.o cpuid.o
> > > > > > pmu.o
> > > > > > mtrr.o \
> > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > > index e545a8a613b1..ddb28f5ca252 100644
> > > > > > --- a/arch/x86/kvm/x86.c
> > > > > > +++ b/arch/x86/kvm/x86.c
> > > > > > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct
> > > > > > kvm
> > > > > > *kvm, long ext)
> > > > > >     	case KVM_CAP_X86_USER_SPACE_MSR:
> > > > > >     	case KVM_CAP_X86_MSR_FILTER:
> > > > > >     	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > > > > > +	case KVM_CAP_IOREGIONFD:
> > > > > >     		r = 1;
> > > > > >     		break;
> > > > > >     	case KVM_CAP_SYNC_REGS:
> > > > > > diff --git a/include/linux/kvm_host.h
> > > > > > b/include/linux/kvm_host.h
> > > > > > index 7f2e2a09ebbd..7cd667dddba9 100644
> > > > > > --- a/include/linux/kvm_host.h
> > > > > > +++ b/include/linux/kvm_host.h
> > > > > > @@ -470,6 +470,10 @@ struct kvm {
> > > > > >     		struct mutex      resampler_lock;
> > > > > >     	} irqfds;
> > > > > >     	struct list_head ioeventfds;
> > > > > > +#endif
> > > > > > +#ifdef CONFIG_KVM_IOREGION
> > > > > > +	struct list_head ioregions_mmio;
> > > > > > +	struct list_head ioregions_pio;
> > > > > >     #endif
> > > > > >     	struct kvm_vm_stat stat;
> > > > > >     	struct kvm_arch arch;
> > > > > > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct
> > > > > > kvm
> > > > > > *kvm, struct kvm_ioeventfd *args)
> > > > > >     #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > > > > > +#ifdef CONFIG_KVM_IOREGION
> > > > > > +void kvm_ioregionfd_init(struct kvm *kvm);
> > > > > > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion
> > > > > > *args);
> > > > > > +
> > > > > > +#else
> > > > > > +
> > > > > > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > > > > > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > > > > > kvm_ioregion *args)
> > > > > > +{
> > > > > > +	return -ENOSYS;
> > > > > > +}
> > > > > > +#endif
> > > > > > +
> > > > > >     void kvm_arch_irq_routing_update(struct kvm *kvm);
> > > > > >     static inline void kvm_make_request(int req, struct kvm_vcpu
> > > > > > *vcpu)
> > > > > > diff --git a/include/uapi/linux/kvm.h
> > > > > > b/include/uapi/linux/kvm.h
> > > > > > index ca41220b40b8..81e775778c66 100644
> > > > > > --- a/include/uapi/linux/kvm.h
> > > > > > +++ b/include/uapi/linux/kvm.h
> > > > > > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> > > > > >     	__u8  pad[36];
> > > > > >     };
> > > > > > +enum {
> > > > > > +	kvm_ioregion_flag_nr_pio,
> > > > > > +	kvm_ioregion_flag_nr_posted_writes,
> > > > > > +	kvm_ioregion_flag_nr_max,
> > > > > > +};
> > > > > > +
> > > > > > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > > > > > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > > > > > kvm_ioregion_flag_nr_posted_writes)
> > > > > > +
> > > > > > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > > > > > kvm_ioregion_flag_nr_max) - 1)
> > > > > > +
> > > > > > +struct kvm_ioregion {
> > > > > > +	__u64 guest_paddr; /* guest physical address */
> > > > > > +	__u64 memory_size; /* bytes */
> > > > > > +	__u64 user_data;
> > > > > What will this field do? Is it a token?
> > > > > 
> > > > Yes, it’s an opaque token that can be used by userspace in order to
> > > > determine which MemoryRegion to dispatch.
> > > This part I don't understand. Userspace should know the fd number
> > > (which
> > > I guess should be sufficient?).
> > > 
> > I think the user_data field can be useful if same fd is registered with
> > multiple GPA ranges.
> 
> 
> Yes, but if I read the code correctly, we encode the address in the
> protocol. Isn't it sufficient?

struct ioregionfd_cmd::offset is a relative address from the start of
the ioregion.

The idea is that userspace doesn't need to look up the address. The
kernel has already done that and provided an offset that is relative to
the start of the ioregion that was registered with KVM_SET_IOREGION.

Userspace uses user_data to determine the device/sub-device/region (e.g.
QEMU's MemoryRegion) and passes the offset directly to its
->read()/->write() handler function.

If a userspace program prefers to re-dispatch based on the address then
it can set user_data = guest_paddr, but I think most userspace programs
will prefer to set user_data to a DeviceState (for simple devices with
just one ioregion) or MemoryRegion (for complex devices with multiple
ioregions) pointer instead.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-05 10:25             ` Stefan Hajnoczi
@ 2021-01-06  5:21               ` Jason Wang
  2021-01-06 15:05                 ` Stefan Hajnoczi
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-06  5:21 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva


On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
> On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
>> On 2021/1/5 上午8:02, Elena Afanasova wrote:
>>> On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
>>>> On 2021/1/4 上午4:32, Elena Afanasova wrote:
>>>>> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>>>>>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>>>>>> This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
>>>>>> How about FAST_MMIO?
>>>>>>
>>>>> I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
>>>>> suitable
>>>>> for triggers which could use posted writes. The struct
>>>>> ioregionfd_cmd
>>>>> size bits and the data field will be unused in this case.
>>>> Note that eventfd checks for length and have datamatch support. Do
>>>> we
>>>> need to do something similar.
>>>>
>>> Do you think datamatch support is necessary for ioregionfd?
>>
>> I'm not sure. But if we don't have this support, it probably means we can't
>> use eventfd for ioregionfd.
> This is an interesting question because ioregionfd and ioeventfd have
> different semantics. While it would be great to support all ioeventfd
> features in ioregionfd, I'm not sure that is possible. I think ioeventfd
> will remain useful for devices that only need a doorbell write register.
>
> The differences:
>
> 1. ioeventfd has datamatch. This could be implemented in ioregionfd so
>     that a datamatch failure results in the classic ioctl(KVM_RUN)
>     MMIO/PIO exit reason and the VMM can handle the access.
>
>     I'm not sure if this feature is useful though. Most of the time
>     ioregionfd users want to handle all accesses to the region and the
>     VMM may not even know how to handle register accesses because they
>     can only be handled in a dedicated thread or an out-of-process
>     device.


It's about whether or not the current semantic of ioregion is sufficient 
for implementing doorbell.

E.g in the case of virtio, the virtqueue index is encoded in the write 
to the doorbell. And if a single MMIO area is used for all virtqueues, 
datamatch is probably a must in this case.


>
> 2. Write coalescing. ioeventfd combines writes because an eventfd is a
>     counter. The counter is incremented on each write and the counter is
>     reset to zero by reading the eventfd. This way a slow userspace can
>     read the eventfd just once while a fast guest writes to it many times
>     (similar to interrupt coalescing in physical hardware). ioregionfd
>     cannot really do that, userspace will have to read one struct
>     ioregionfd_cmd per guest access. Elena and I briefly discussed
>     optimizing this by implementing a custom struct file_operations so
>     the ->read() callback can coalesce multiple writes to the same
>     address, but this makes sense mostly for guest write operations in
>     FAST_MMIO mode, doesn't allow userspace to provide any type of fd
>     (socket, pipe, etc), and increases the complexity.


So that's why I suggest to use eBPF instead of a hard-coding a protocol 
(in the future). We wouldn't have any limitation then.


>
> Keeping in mind that ioeventfd and ioregionfd can be combined,


That's the motivation of my question :) Use eventfd + ioregreionfd


>   I think
> the main advantage to supporting all ioeventfd features in ioregionfd is
> uniformity (offering everything through a single interface).


Yes.


>
> Supporting ioeventfd features in ioregionfd is possible to an extent but
> will make ioctl(KVM_SET_IOREGION) more complex and userspace will still
> have to create multiple fds because ioeventfd-style doorbell write
> registers have different semantics from regular ioregionfd regions
> (no posted writes).
>
> My thoughts are that ioregionfd should do what it's good at and
> ioeventfd should do what it's good at.


Fine with me, and we can leave the rest for the future eBPF extension 
for ioregionfd.


>
>>>> I guess the idea is to have a generic interface to let eventfd work
>>>> for
>>>> ioregion as well.
>>>>
>>> It seems that posted writes is the only "fast" case in ioregionfd. So I
>>> was thinking about using FAST_MMIO for this case only. Maybe in some
>>> cases it will be better to just use ioeventfd. But I'm not sure.
>>
>> To be a generic infrastructure, it's better to have this, but we can listen
>> from the opinion of others.
> I think we want both FAST_MMIO and regular MMIO options for posted
> writes:
>
> 1. FAST_MMIO - ioregionfd_cmd size and data fields are zero and do not
>     contain information about the nature of the guest access. This is
>     fine for ioeventfd doorbell style registers because we don't need
>     that information.


Is FAST_MMIO always for doorbell? If not, we probably need the size and 
data.


>
> 2. Regular MMIO - ioregionfd_cmd size and data fields contain valid data
>     about the nature of the guest access. This is needed when the device
>     register is more than a simple "kick" doorbell. For example, if the
>     device needs to know the value that the guest wrote.
>
> I suggest defining an additional KVM_SET_IOREGION flag called
> KVM_IOREGION_FAST_MMIO that can be set together with
> KVM_IOREGION_POSTED_WRITES.


If we need to expose FAST_MMIO to userspace, we probably need to define 
its semantics which is probably not easy since it's an architecture 
optimization.


>
> KVM_IOREGION_PIO cannot be used together with KVM_IOREGION_FAST_MMIO.
>
> In theory KVM_IOREGION_POSTED_WRITES doesn't need to be set with
> KVM_IOREGION_FAST_MMIO. Userspace would have to send back a struct
> ioregionfd_resp to acknowledge that the write has been handled.


Right, and it also depends on whether or not the hardware support (e.g 
whether or not it can decode the instructions).


>
> Read accesses are indistinguishable from write accesses with
> KVM_IOREGION_FAST_MMIO so it only makes sense to use the flag on
> write-only regions. If the guest performs a read then userspace will see
> a write and the destination CPU register will be unchanged (I think this
> is already the case for FAST_MMIO).


Yes.

Thanks


>
>>>>>>> Guest
>>>>>>> read and write accesses are dispatched through the given
>>>>>>> ioregionfd
>>>>>>> instead of returning from ioctl(KVM_RUN). Regions can be
>>>>>>> deleted by
>>>>>>> setting fds to -1.
>>>>>>>
>>>>>>> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
>>>>>>> ---
>>>>>>>      arch/x86/kvm/Kconfig     |   1 +
>>>>>>>      arch/x86/kvm/Makefile    |   1 +
>>>>>>>      arch/x86/kvm/x86.c       |   1 +
>>>>>>>      include/linux/kvm_host.h |  17 +++
>>>>>>>      include/uapi/linux/kvm.h |  23 ++++
>>>>>>>      virt/kvm/Kconfig         |   3 +
>>>>>>>      virt/kvm/eventfd.c       |  25 +++++
>>>>>>>      virt/kvm/eventfd.h       |  14 +++
>>>>>>>      virt/kvm/ioregion.c      | 233
>>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>>>      virt/kvm/ioregion.h      |  15 +++
>>>>>>>      virt/kvm/kvm_main.c      |  20 +++-
>>>>>>>      11 files changed, 350 insertions(+), 3 deletions(-)
>>>>>>>      create mode 100644 virt/kvm/eventfd.h
>>>>>>>      create mode 100644 virt/kvm/ioregion.c
>>>>>>>      create mode 100644 virt/kvm/ioregion.h
>>>>>>>
>>>>>>> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
>>>>>>> index f92dfd8ef10d..b914ef375199 100644
>>>>>>> --- a/arch/x86/kvm/Kconfig
>>>>>>> +++ b/arch/x86/kvm/Kconfig
>>>>>>> @@ -33,6 +33,7 @@ config KVM
>>>>>>>      	select HAVE_KVM_IRQ_BYPASS
>>>>>>>      	select HAVE_KVM_IRQ_ROUTING
>>>>>>>      	select HAVE_KVM_EVENTFD
>>>>>>> +	select KVM_IOREGION
>>>>>>>      	select KVM_ASYNC_PF
>>>>>>>      	select USER_RETURN_NOTIFIER
>>>>>>>      	select KVM_MMIO
>>>>>>> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
>>>>>>> index b804444e16d4..b3b17dc9f7d4 100644
>>>>>>> --- a/arch/x86/kvm/Makefile
>>>>>>> +++ b/arch/x86/kvm/Makefile
>>>>>>> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>>>>>>>      kvm-y			+= $(KVM)/kvm_main.o
>>>>>>> $(KVM)/coalesced_mmio.o \
>>>>>>>      				$(KVM)/eventfd.o
>>>>>>> $(KVM)/irqchip.o
>>>>>>> $(KVM)/vfio.o
>>>>>>>      kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>>>>>>> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>>>>>>>      kvm-y			+= x86.o emulate.o i8259.o
>>>>>>> irq.o
>>>>>>> lapic.o \
>>>>>>>      			   i8254.o ioapic.o irq_comm.o cpuid.o
>>>>>>> pmu.o
>>>>>>> mtrr.o \
>>>>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>>>> index e545a8a613b1..ddb28f5ca252 100644
>>>>>>> --- a/arch/x86/kvm/x86.c
>>>>>>> +++ b/arch/x86/kvm/x86.c
>>>>>>> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct
>>>>>>> kvm
>>>>>>> *kvm, long ext)
>>>>>>>      	case KVM_CAP_X86_USER_SPACE_MSR:
>>>>>>>      	case KVM_CAP_X86_MSR_FILTER:
>>>>>>>      	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
>>>>>>> +	case KVM_CAP_IOREGIONFD:
>>>>>>>      		r = 1;
>>>>>>>      		break;
>>>>>>>      	case KVM_CAP_SYNC_REGS:
>>>>>>> diff --git a/include/linux/kvm_host.h
>>>>>>> b/include/linux/kvm_host.h
>>>>>>> index 7f2e2a09ebbd..7cd667dddba9 100644
>>>>>>> --- a/include/linux/kvm_host.h
>>>>>>> +++ b/include/linux/kvm_host.h
>>>>>>> @@ -470,6 +470,10 @@ struct kvm {
>>>>>>>      		struct mutex      resampler_lock;
>>>>>>>      	} irqfds;
>>>>>>>      	struct list_head ioeventfds;
>>>>>>> +#endif
>>>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>>>> +	struct list_head ioregions_mmio;
>>>>>>> +	struct list_head ioregions_pio;
>>>>>>>      #endif
>>>>>>>      	struct kvm_vm_stat stat;
>>>>>>>      	struct kvm_arch arch;
>>>>>>> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct
>>>>>>> kvm
>>>>>>> *kvm, struct kvm_ioeventfd *args)
>>>>>>>      #endif /* CONFIG_HAVE_KVM_EVENTFD */
>>>>>>> +#ifdef CONFIG_KVM_IOREGION
>>>>>>> +void kvm_ioregionfd_init(struct kvm *kvm);
>>>>>>> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion
>>>>>>> *args);
>>>>>>> +
>>>>>>> +#else
>>>>>>> +
>>>>>>> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
>>>>>>> +static inline int kvm_ioregionfd(struct kvm *kvm, struct
>>>>>>> kvm_ioregion *args)
>>>>>>> +{
>>>>>>> +	return -ENOSYS;
>>>>>>> +}
>>>>>>> +#endif
>>>>>>> +
>>>>>>>      void kvm_arch_irq_routing_update(struct kvm *kvm);
>>>>>>>      static inline void kvm_make_request(int req, struct kvm_vcpu
>>>>>>> *vcpu)
>>>>>>> diff --git a/include/uapi/linux/kvm.h
>>>>>>> b/include/uapi/linux/kvm.h
>>>>>>> index ca41220b40b8..81e775778c66 100644
>>>>>>> --- a/include/uapi/linux/kvm.h
>>>>>>> +++ b/include/uapi/linux/kvm.h
>>>>>>> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>>>>>>>      	__u8  pad[36];
>>>>>>>      };
>>>>>>> +enum {
>>>>>>> +	kvm_ioregion_flag_nr_pio,
>>>>>>> +	kvm_ioregion_flag_nr_posted_writes,
>>>>>>> +	kvm_ioregion_flag_nr_max,
>>>>>>> +};
>>>>>>> +
>>>>>>> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
>>>>>>> +#define KVM_IOREGION_POSTED_WRITES (1 <<
>>>>>>> kvm_ioregion_flag_nr_posted_writes)
>>>>>>> +
>>>>>>> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
>>>>>>> kvm_ioregion_flag_nr_max) - 1)
>>>>>>> +
>>>>>>> +struct kvm_ioregion {
>>>>>>> +	__u64 guest_paddr; /* guest physical address */
>>>>>>> +	__u64 memory_size; /* bytes */
>>>>>>> +	__u64 user_data;
>>>>>> What will this field do? Is it a token?
>>>>>>
>>>>> Yes, it’s an opaque token that can be used by userspace in order to
>>>>> determine which MemoryRegion to dispatch.
>>>> This part I don't understand. Userspace should know the fd number
>>>> (which
>>>> I guess should be sufficient?).
>>>>
>>> I think the user_data field can be useful if same fd is registered with
>>> multiple GPA ranges.
>>
>> Yes, but if I read the code correctly, we encode the address in the
>> protocol. Isn't it sufficient?
> struct ioregionfd_cmd::offset is a relative address from the start of
> the ioregion.
>
> The idea is that userspace doesn't need to look up the address. The
> kernel has already done that and provided an offset that is relative to
> the start of the ioregion that was registered with KVM_SET_IOREGION.
>
> Userspace uses user_data to determine the device/sub-device/region (e.g.
> QEMU's MemoryRegion) and passes the offset directly to its
> ->read()/->write() handler function.
>
> If a userspace program prefers to re-dispatch based on the address then
> it can set user_data = guest_paddr, but I think most userspace programs
> will prefer to set user_data to a DeviceState (for simple devices with
> just one ioregion) or MemoryRegion (for complex devices with multiple
> ioregions) pointer instead.
>
> Stefan


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-06  5:21               ` Jason Wang
@ 2021-01-06 15:05                 ` Stefan Hajnoczi
  2021-01-07  3:30                   ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-06 15:05 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 5540 bytes --]

On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
> 
> On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
> > On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
> > > On 2021/1/5 上午8:02, Elena Afanasova wrote:
> > > > On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
> > > > > On 2021/1/4 上午4:32, Elena Afanasova wrote:
> > > > > > On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> > > > > > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > > > > > > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
> > > > > > > How about FAST_MMIO?
> > > > > > > 
> > > > > > I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
> > > > > > suitable
> > > > > > for triggers which could use posted writes. The struct
> > > > > > ioregionfd_cmd
> > > > > > size bits and the data field will be unused in this case.
> > > > > Note that eventfd checks for length and have datamatch support. Do
> > > > > we
> > > > > need to do something similar.
> > > > > 
> > > > Do you think datamatch support is necessary for ioregionfd?
> > > 
> > > I'm not sure. But if we don't have this support, it probably means we can't
> > > use eventfd for ioregionfd.
> > This is an interesting question because ioregionfd and ioeventfd have
> > different semantics. While it would be great to support all ioeventfd
> > features in ioregionfd, I'm not sure that is possible. I think ioeventfd
> > will remain useful for devices that only need a doorbell write register.
> > 
> > The differences:
> > 
> > 1. ioeventfd has datamatch. This could be implemented in ioregionfd so
> >     that a datamatch failure results in the classic ioctl(KVM_RUN)
> >     MMIO/PIO exit reason and the VMM can handle the access.
> > 
> >     I'm not sure if this feature is useful though. Most of the time
> >     ioregionfd users want to handle all accesses to the region and the
> >     VMM may not even know how to handle register accesses because they
> >     can only be handled in a dedicated thread or an out-of-process
> >     device.
> 
> 
> It's about whether or not the current semantic of ioregion is sufficient for
> implementing doorbell.
> 
> E.g in the case of virtio, the virtqueue index is encoded in the write to
> the doorbell. And if a single MMIO area is used for all virtqueues,
> datamatch is probably a must in this case.

struct ioregionfd_cmd contains not just the register offset, but also
the value written by the guest. Therefore datamatch is not necessary.

Datamatch would only be useful as some kind of more complex optimization
where different values written to the same register dispatch to
different fds.

> > > > > I guess the idea is to have a generic interface to let eventfd work
> > > > > for
> > > > > ioregion as well.
> > > > > 
> > > > It seems that posted writes is the only "fast" case in ioregionfd. So I
> > > > was thinking about using FAST_MMIO for this case only. Maybe in some
> > > > cases it will be better to just use ioeventfd. But I'm not sure.
> > > 
> > > To be a generic infrastructure, it's better to have this, but we can listen
> > > from the opinion of others.
> > I think we want both FAST_MMIO and regular MMIO options for posted
> > writes:
> > 
> > 1. FAST_MMIO - ioregionfd_cmd size and data fields are zero and do not
> >     contain information about the nature of the guest access. This is
> >     fine for ioeventfd doorbell style registers because we don't need
> >     that information.
> 
> 
> Is FAST_MMIO always for doorbell? If not, we probably need the size and
> data.

My understanding is that FAST_MMIO only provides the guest physical
address and no additional information. In fact, I'm not even sure if we
know whether the access is a read or a write.

So there is extremely limited information to work with and it's
basically only useful for doorbell writes.

> > 2. Regular MMIO - ioregionfd_cmd size and data fields contain valid data
> >     about the nature of the guest access. This is needed when the device
> >     register is more than a simple "kick" doorbell. For example, if the
> >     device needs to know the value that the guest wrote.
> > 
> > I suggest defining an additional KVM_SET_IOREGION flag called
> > KVM_IOREGION_FAST_MMIO that can be set together with
> > KVM_IOREGION_POSTED_WRITES.
> 
> 
> If we need to expose FAST_MMIO to userspace, we probably need to define its
> semantics which is probably not easy since it's an architecture
> optimization.

Maybe the name KVM_IOREGION_FAST_MMIO name should be changed to
something more specific like KVM_IOREGION_OFFSET_ONLY, meaning that only
the offset field is valid.

I haven't checked if and how other architectures implement FAST_MMIO,
but they will at least be able to provide the offset :).

> > KVM_IOREGION_PIO cannot be used together with KVM_IOREGION_FAST_MMIO.
> > 
> > In theory KVM_IOREGION_POSTED_WRITES doesn't need to be set with
> > KVM_IOREGION_FAST_MMIO. Userspace would have to send back a struct
> > ioregionfd_resp to acknowledge that the write has been handled.
> 
> 
> Right, and it also depends on whether or not the hardware support (e.g
> whether or not it can decode the instructions).

The KVM_IOREGION_FAST_MMIO flag should be documented as an optimization
hint. If hardware doesn't support FAST_MMIO then struct ioregionfd_cmd
will contain all fields. Userspace will be able to process the cmd
either way.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-06 15:05                 ` Stefan Hajnoczi
@ 2021-01-07  3:30                   ` Jason Wang
  2021-01-07 17:53                     ` Stefan Hajnoczi
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-07  3:30 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva


On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
> On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
>> On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
>>> On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
>>>> On 2021/1/5 上午8:02, Elena Afanasova wrote:
>>>>> On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
>>>>>> On 2021/1/4 上午4:32, Elena Afanasova wrote:
>>>>>>> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>>>>>>>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>>>>>>>> This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
>>>>>>>> How about FAST_MMIO?
>>>>>>>>
>>>>>>> I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
>>>>>>> suitable
>>>>>>> for triggers which could use posted writes. The struct
>>>>>>> ioregionfd_cmd
>>>>>>> size bits and the data field will be unused in this case.
>>>>>> Note that eventfd checks for length and have datamatch support. Do
>>>>>> we
>>>>>> need to do something similar.
>>>>>>
>>>>> Do you think datamatch support is necessary for ioregionfd?
>>>> I'm not sure. But if we don't have this support, it probably means we can't
>>>> use eventfd for ioregionfd.
>>> This is an interesting question because ioregionfd and ioeventfd have
>>> different semantics. While it would be great to support all ioeventfd
>>> features in ioregionfd, I'm not sure that is possible. I think ioeventfd
>>> will remain useful for devices that only need a doorbell write register.
>>>
>>> The differences:
>>>
>>> 1. ioeventfd has datamatch. This could be implemented in ioregionfd so
>>>      that a datamatch failure results in the classic ioctl(KVM_RUN)
>>>      MMIO/PIO exit reason and the VMM can handle the access.
>>>
>>>      I'm not sure if this feature is useful though. Most of the time
>>>      ioregionfd users want to handle all accesses to the region and the
>>>      VMM may not even know how to handle register accesses because they
>>>      can only be handled in a dedicated thread or an out-of-process
>>>      device.
>>
>> It's about whether or not the current semantic of ioregion is sufficient for
>> implementing doorbell.
>>
>> E.g in the case of virtio, the virtqueue index is encoded in the write to
>> the doorbell. And if a single MMIO area is used for all virtqueues,
>> datamatch is probably a must in this case.
> struct ioregionfd_cmd contains not just the register offset, but also
> the value written by the guest. Therefore datamatch is not necessary.
>
> Datamatch would only be useful as some kind of more complex optimization
> where different values writtent to the same register dispatch to
> different fds.


That's exactly the case of virtio. Consider queues 1 and 2 share the MMIO 
register. We need to use datamatch to dispatch the notification to 
different eventfds.


>
>>>>>> I guess the idea is to have a generic interface to let eventfd work
>>>>>> for
>>>>>> ioregion as well.
>>>>>>
>>>>> It seems that posted writes is the only "fast" case in ioregionfd. So I
>>>>> was thinking about using FAST_MMIO for this case only. Maybe in some
>>>>> cases it will be better to just use ioeventfd. But I'm not sure.
>>>> To be a generic infrastructure, it's better to have this, but we can listen
>>>> from the opinion of others.
>>> I think we want both FAST_MMIO and regular MMIO options for posted
>>> writes:
>>>
>>> 1. FAST_MMIO - ioregionfd_cmd size and data fields are zero and do not
>>>      contain information about the nature of the guest access. This is
>>>      fine for ioeventfd doorbell style registers because we don't need
>>>      that information.
>>
>> Is FAST_MMIO always for doorbell? If not, we probably need the size and
>> data.
> My understanding is that FAST_MMIO only provides the guest physical
> address and no additional information. In fact, I'm not even sure if we
> know whether the access is a read or a write.
>
> So there is extremely limited information to work with and it's
> basically only useful for doorbell writes.
>
>>> 2. Regular MMIO - ioregionfd_cmd size and data fields contain valid data
>>>      about the nature of the guest access. This is needed when the device
>>>      register is more than a simple "kick" doorbell. For example, if the
>>>      device needs to know the value that the guest wrote.
>>>
>>> I suggest defining an additional KVM_SET_IOREGION flag called
>>> KVM_IOREGION_FAST_MMIO that can be set together with
>>> KVM_IOREGION_POSTED_WRITES.
>>
>> If we need to expose FAST_MMIO to userspace, we probably need to define its
>> semantics which is probably not easy since it's an architecture
>> optimization.
> Maybe the name KVM_IOREGION_FAST_MMIO name should be changed to
> something more specific like KVM_IOREGION_OFFSET_ONLY, meaning that only
> the offset field is valid.


Or we can do like what eventfd did, implies FAST_MMIO when memory_size 
is zero (kvm_assign_ioeventfd()):

     if (!args->len && bus_idx == KVM_MMIO_BUS) {
         ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
         if (ret < 0)
             goto fast_fail;
     }


>
> I haven't checked if and how other architectures implement FAST_MMIO,
> but they will at least be able to provide the offset :).
>
>>> KVM_IOREGION_PIO cannot be used together with KVM_IOREGION_FAST_MMIO.
>>>
>>> In theory KVM_IOREGION_POSTED_WRITES doesn't need to be set with
>>> KVM_IOREGION_FAST_MMIO. Userspace would have to send back a struct
>>> ioregionfd_resp to acknowledge that the write has been handled.
>>
>> Right, and it also depends on whether or not the hardware support (e.g
>> whether or not it can decode the instructions).
> The KVM_IOREGION_FAST_MMIO flag should be documented as an optimization
> hint. If hardware doesn't support FAST_MMIO then struct ioregionfd_cmd
> will contain all fields. Userspace will be able to process the cmd
> either way.


You mean always have a fallback to MMIO for FAST_MMIO? That should be 
fine but looks less optimal than the implying FAST_MMIO for zero length. 
I still think introducing "FAST_MMIO" may bring confusion for users ...

Thanks


>
> Stefan


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-07  3:30                   ` Jason Wang
@ 2021-01-07 17:53                     ` Stefan Hajnoczi
  2021-01-13  2:38                       ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-07 17:53 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 8436 bytes --]

On Thu, Jan 07, 2021 at 11:30:47AM +0800, Jason Wang wrote:
> 
> On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
> > On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
> > > On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
> > > > On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
> > > > > On 2021/1/5 上午8:02, Elena Afanasova wrote:
> > > > > > On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
> > > > > > > On 2021/1/4 上午4:32, Elena Afanasova wrote:
> > > > > > > > On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> > > > > > > > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > > > > > > > > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
> > > > > > > > > How about FAST_MMIO?
> > > > > > > > > 
> > > > > > > > I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
> > > > > > > > suitable
> > > > > > > > for triggers which could use posted writes. The struct
> > > > > > > > ioregionfd_cmd
> > > > > > > > size bits and the data field will be unused in this case.
> > > > > > > Note that eventfd checks for length and have datamatch support. Do
> > > > > > > we
> > > > > > > need to do something similar.
> > > > > > > 
> > > > > > Do you think datamatch support is necessary for ioregionfd?
> > > > > I'm not sure. But if we don't have this support, it probably means we can't
> > > > > use eventfd for ioregionfd.
> > > > This is an interesting question because ioregionfd and ioeventfd have
> > > > different semantics. While it would be great to support all ioeventfd
> > > > features in ioregionfd, I'm not sure that is possible. I think ioeventfd
> > > > will remain useful for devices that only need a doorbell write register.
> > > > 
> > > > The differences:
> > > > 
> > > > 1. ioeventfd has datamatch. This could be implemented in ioregionfd so
> > > >      that a datamatch failure results in the classic ioctl(KVM_RUN)
> > > >      MMIO/PIO exit reason and the VMM can handle the access.
> > > > 
> > > >      I'm not sure if this feature is useful though. Most of the time
> > > >      ioregionfd users want to handle all accesses to the region and the
> > > >      VMM may not even know how to handle register accesses because they
> > > >      can only be handled in a dedicated thread or an out-of-process
> > > >      device.
> > > 
> > > It's about whether or not the current semantic of ioregion is sufficient for
> > > implementing doorbell.
> > > 
> > > E.g in the case of virtio, the virtqueue index is encoded in the write to
> > > the doorbell. And if a single MMIO area is used for all virtqueues,
> > > datamatch is probably a must in this case.
> > struct ioregionfd_cmd contains not just the register offset, but also
> > the value written by the guest. Therefore datamatch is not necessary.
> > 
> > Datamatch would only be useful as some kind of more complex optimization
> > where different values written to the same register dispatch to
> > different fds.
> 
> 
That's exactly the case of virtio. Consider queues 1 and 2 share the MMIO
register. We need to use datamatch to dispatch the notification to different
eventfds.

I can see two options without datamatch:

1. If both virtqueues are handled by the same userspace thread then only
   1 fd is needed. ioregionfd sends the value written to the register,
   so userspace is able to distinguish between the virtqueues.

2. If separate userspace threads process the virtqueues, then set up the
   virtio-pci capabilities so the virtqueues have separate notification
   registers:
   https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-1150004

With ioeventfd 2 fds are needed in case #1 because the data value
written to the register is not communicated to userspace. But ioregionfd
does not have this limitation, so I'm not sure whether datamatch is
really needed in ioregionfd?

Or is there a use case that I missed?

> > > > > > > I guess the idea is to have a generic interface to let eventfd work
> > > > > > > for
> > > > > > > ioregion as well.
> > > > > > > 
> > > > > > It seems that posted writes is the only "fast" case in ioregionfd. So I
> > > > > > was thinking about using FAST_MMIO for this case only. Maybe in some
> > > > > > cases it will be better to just use ioeventfd. But I'm not sure.
> > > > > To be a generic infrastructure, it's better to have this, but we can listen
> > > > > from the opinion of others.
> > > > I think we want both FAST_MMIO and regular MMIO options for posted
> > > > writes:
> > > > 
> > > > 1. FAST_MMIO - ioregionfd_cmd size and data fields are zero and do not
> > > >      contain information about the nature of the guest access. This is
> > > >      fine for ioeventfd doorbell style registers because we don't need
> > > >      that information.
> > > 
> > > Is FAST_MMIO always for doorbell? If not, we probably need the size and
> > > data.
> > My understanding is that FAST_MMIO only provides the guest physical
> > address and no additional information. In fact, I'm not even sure if we
> > know whether the access is a read or a write.
> > 
> > So there is extremely limited information to work with and it's
> > basically only useful for doorbell writes.
> > 
> > > > 2. Regular MMIO - ioregionfd_cmd size and data fields contain valid data
> > > >      about the nature of the guest access. This is needed when the device
> > > >      register is more than a simple "kick" doorbell. For example, if the
> > > >      device needs to know the value that the guest wrote.
> > > > 
> > > > I suggest defining an additional KVM_SET_IOREGION flag called
> > > > KVM_IOREGION_FAST_MMIO that can be set together with
> > > > KVM_IOREGION_POSTED_WRITES.
> > > 
> > > If we need to expose FAST_MMIO to userspace, we probably need to define its
> > > semantics which is probably not easy since it's an architecture
> > > optimization.
> > Maybe the name KVM_IOREGION_FAST_MMIO name should be changed to
> > something more specific like KVM_IOREGION_OFFSET_ONLY, meaning that only
> > the offset field is valid.
> 
> 
> Or we can do like what eventfd did, implies FAST_MMIO when memory_size is
> zero (kvm_assign_ioeventfd()):
> 
>     if (!args->len && bus_idx == KVM_MMIO_BUS) {
>         ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
>         if (ret < 0)
>             goto fast_fail;
>     }

Yes!

> > I haven't checked if and how other architectures implement FAST_MMIO,
> > but they will at least be able to provide the offset :).
> > 
> > > > KVM_IOREGION_PIO cannot be used together with KVM_IOREGION_FAST_MMIO.
> > > > 
> > > > In theory KVM_IOREGION_POSTED_WRITES doesn't need to be set with
> > > > KVM_IOREGION_FAST_MMIO. Userspace would have to send back a struct
> > > > ioregionfd_resp to acknowledge that the write has been handled.
> > > 
> > > Right, and it also depends on whether or not the hardware support (e.g
> > > whether or not it can decode the instructions).
> > The KVM_IOREGION_FAST_MMIO flag should be documented as an optimization
> > hint. If hardware doesn't support FAST_MMIO then struct ioregionfd_cmd
> > will contain all fields. Userspace will be able to process the cmd
> > either way.
> 
> 
> You mean always have a fallback to MMIO for FAST_MMIO? That should be fine
> but looks less optimal than the implying FAST_MMIO for zero length. I still
> think introducing "FAST_MMIO" may bring confusion for users ...

Regarding the fallback, my understanding is that ioeventfds are always
placed on both the MMIO and FAST_MMIO bus when len is zero. That way
architectures that don't support FAST_MMIO will still dispatch those
ioeventfds. In virt/kvm/eventfd.c:kvm_assign_ioeventfd():

  ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
  ...
  if (!args->len && bus_idx == KVM_MMIO_BUS) {
      ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

So ioeventfd is already doing this fallback thing.

Let's follow ioeventfd:
1. len=0 means the size/data fields are not needed. Userspace cannot
   rely on these fields being valid.
2. There is an automatic fallback to the slow MMIO bus so that slow path
   accesses are still detected by the ioregion.

The explicit KVM_IOREGION_FAST_MMIO flag I mentioned is not needed.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-07 17:53                     ` Stefan Hajnoczi
@ 2021-01-13  2:38                       ` Jason Wang
  2021-01-13 15:52                         ` Stefan Hajnoczi
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-13  2:38 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva


On 2021/1/8 上午1:53, Stefan Hajnoczi wrote:
> On Thu, Jan 07, 2021 at 11:30:47AM +0800, Jason Wang wrote:
>> On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
>>> On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
>>>> On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
>>>>> On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
>>>>>> On 2021/1/5 上午8:02, Elena Afanasova wrote:
>>>>>>> On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
>>>>>>>> On 2021/1/4 上午4:32, Elena Afanasova wrote:
>>>>>>>>> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>>>>>>>>>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>>>>>>>>>> This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
>>>>>>>>>> How about FAST_MMIO?
>>>>>>>>>>
>>>>>>>>> I’ll add KVM_IOREGION_FAST_MMIO flag support. So this may be
>>>>>>>>> suitable
>>>>>>>>> for triggers which could use posted writes. The struct
>>>>>>>>> ioregionfd_cmd
>>>>>>>>> size bits and the data field will be unused in this case.
>>>>>>>> Note that eventfd checks for length and have datamatch support. Do
>>>>>>>> we
>>>>>>>> need to do something similar.
>>>>>>>>
>>>>>>> Do you think datamatch support is necessary for ioregionfd?
>>>>>> I'm not sure. But if we don't have this support, it probably means we can't
>>>>>> use eventfd for ioregionfd.
>>>>> This is an interesting question because ioregionfd and ioeventfd have
>>>>> different semantics. While it would be great to support all ioeventfd
>>>>> features in ioregionfd, I'm not sure that is possible. I think ioeventfd
>>>>> will remain useful for devices that only need a doorbell write register.
>>>>>
>>>>> The differences:
>>>>>
>>>>> 1. ioeventfd has datamatch. This could be implemented in ioregionfd so
>>>>>       that a datamatch failure results in the classic ioctl(KVM_RUN)
>>>>>       MMIO/PIO exit reason and the VMM can handle the access.
>>>>>
>>>>>       I'm not sure if this feature is useful though. Most of the time
>>>>>       ioregionfd users want to handle all accesses to the region and the
>>>>>       VMM may not even know how to handle register accesses because they
>>>>>       can only be handled in a dedicated thread or an out-of-process
>>>>>       device.
>>>> It's about whether or not the current semantic of ioregion is sufficient for
>>>> implementing doorbell.
>>>>
>>>> E.g in the case of virtio, the virtqueue index is encoded in the write to
>>>> the doorbell. And if a single MMIO area is used for all virtqueues,
>>>> datamatch is probably a must in this case.
>>> struct ioregionfd_cmd contains not just the register offset, but also
>>> the value written by the guest. Therefore datamatch is not necessary.
>>>
>>> Datamatch would only be useful as some kind of more complex optimization
>>> where different values written to the same register dispatch to
>>> different fds.
>>
>> That's exactly the case of virtio. Consider queue 1,2 shares the MMIO
>> register. We need use datamatch to dispatch the notification to different
>> eventfds.
> I can see two options without datamatch:
>
> 1. If both virtqueues are handled by the same userspace thread then only
>     1 fd is needed. ioregionfd sends the value written to the register,
>     so userspace is able to distinguish between the virtqueues.


Right.


>
> 2. If separate userspace threads process the virtqueues, then set up the
>     virtio-pci capabilities so the virtqueues have separate notification
>     registers:
>     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-1150004


Right. But this works only when PCI transport is used and queue index 
could be deduced from the register address (separated doorbell).

If we use MMIO or sharing the doorbell registers among all the 
virtqueues (multiplexer is zero in the above case) , it can't work 
without datamatch.


>
> With ioeventfd 2 fds are needed in case #1 because the data value
> written to the register is not communicated to userspace. But ioregionfd
> does not have this limitation, so I'm not sure whether datamatch is
> really needed in ioregionfd?
>
> Or is there a use case that I missed?
>
>>>>>>>> I guess the idea is to have a generic interface to let eventfd work
>>>>>>>> for
>>>>>>>> ioregion as well.
>>>>>>>>
>>>>>>> It seems that posted writes is the only "fast" case in ioregionfd. So I
>>>>>>> was thinking about using FAST_MMIO for this case only. Maybe in some
>>>>>>> cases it will be better to just use ioeventfd. But I'm not sure.
>>>>>> To be a generic infrastructure, it's better to have this, but we can listen
>>>>>> from the opinion of others.
>>>>> I think we want both FAST_MMIO and regular MMIO options for posted
>>>>> writes:
>>>>>
>>>>> 1. FAST_MMIO - ioregionfd_cmd size and data fields are zero and do not
>>>>>       contain information about the nature of the guest access. This is
>>>>>       fine for ioeventfd doorbell style registers because we don't need
>>>>>       that information.
>>>> Is FAST_MMIO always for doorbell? If not, we probably need the size and
>>>> data.
>>> My understanding is that FAST_MMIO only provides the guest physical
>>> address and no additional information. In fact, I'm not even sure if we
>>> know whether the access is a read or a write.
>>>
>>> So there is extremely limited information to work with and it's
>>> basically only useful for doorbell writes.
>>>
>>>>> 2. Regular MMIO - ioregionfd_cmd size and data fields contain valid data
>>>>>       about the nature of the guest access. This is needed when the device
>>>>>       register is more than a simple "kick" doorbell. For example, if the
>>>>>       device needs to know the value that the guest wrote.
>>>>>
>>>>> I suggest defining an additional KVM_SET_IOREGION flag called
>>>>> KVM_IOREGION_FAST_MMIO that can be set together with
>>>>> KVM_IOREGION_POSTED_WRITES.
>>>> If we need to expose FAST_MMIO to userspace, we probably need to define its
>>>> semantics which is probably not easy since it's an architecture
>>>> optimization.
>>> Maybe the name KVM_IOREGION_FAST_MMIO name should be changed to
>>> something more specific like KVM_IOREGION_OFFSET_ONLY, meaning that only
>>> the offset field is valid.
>>
>> Or we can do like what eventfd did, implies FAST_MMIO when memory_size is
>> zero (kvm_assign_ioeventfd()):
>>
>>      if (!args->len && bus_idx == KVM_MMIO_BUS) {
>>          ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
>>          if (ret < 0)
>>              goto fast_fail;
>>      }
> Yes!
>
>>> I haven't checked if and how other architectures implement FAST_MMIO,
>>> but they will at least be able to provide the offset :).
>>>
>>>>> KVM_IOREGION_PIO cannot be used together with KVM_IOREGION_FAST_MMIO.
>>>>>
>>>>> In theory KVM_IOREGION_POSTED_WRITES doesn't need to be set with
>>>>> KVM_IOREGION_FAST_MMIO. Userspace would have to send back a struct
>>>>> ioregionfd_resp to acknowledge that the write has been handled.
>>>> Right, and it also depends on whether or not the hardware support (e.g
>>>> whether or not it can decode the instructions).
>>> The KVM_IOREGION_FAST_MMIO flag should be documented as an optimization
>>> hint. If hardware doesn't support FAST_MMIO then struct ioregionfd_cmd
>>> will contain all fields. Userspace will be able to process the cmd
>>> either way.
>>
>> You mean always have a fallback to MMIO for FAST_MMIO? That should be fine
>> but looks less optimal than implying FAST_MMIO for zero length. I still
>> think introducing "FAST_MMIO" may bring confusion for users ...
> Regarding the fallback, my understanding is that ioeventfds are always
> placed on both the MMIO and FAST_MMIO bus when len is zero. That way
> architectures that don't support FAST_MMIO will still dispatch those
> ioeventfds. In virt/kvm/eventfd.c:kvm_assign_ioeventfd():
>
>    ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
>    ...
>    if (!args->len && bus_idx == KVM_MMIO_BUS) {
>        ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
>
> So ioeventfd is already doing this fallback thing.
>
> Let's follow ioeventfd:
> 1. len=0 means the size/data fields are not needed. Userspace cannot
>     rely on these fields being valid.
> 2. There is an automatic fallback to the slow MMIO bus so that slow path
>     accesses are still detected by the ioregion.
>
> The explicit KVM_IOREGION_FAST_MMIO flag I mentioned is not needed.


Agreed.

Thanks


>
> Stefan


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-13  2:38                       ` Jason Wang
@ 2021-01-13 15:52                         ` Stefan Hajnoczi
  2021-01-14  4:05                           ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-13 15:52 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 2279 bytes --]

On Wed, Jan 13, 2021 at 10:38:29AM +0800, Jason Wang wrote:
> 
> On 2021/1/8 上午1:53, Stefan Hajnoczi wrote:
> > On Thu, Jan 07, 2021 at 11:30:47AM +0800, Jason Wang wrote:
> > > On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
> > > > On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
> > > > > On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
> > > > > > On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
> > > > > > > On 2021/1/5 上午8:02, Elena Afanasova wrote:
> > > > > > > > On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
> > > > > > > > > On 2021/1/4 上午4:32, Elena Afanasova wrote:
> > > > > > > > > > On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> > > > > > > > > > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > 2. If separate userspace threads process the virtqueues, then set up the
> >     virtio-pci capabilities so the virtqueues have separate notification
> >     registers:
> >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-1150004
> 
> 
> Right. But this works only when PCI transport is used and queue index could
> be deduced from the register address (separated doorbell).
> 
> If we use MMIO or sharing the doorbell registers among all the virtqueues
> (multiplexer is zero in the above case) , it can't work without datamatch.

True. Can you think of an application that needs to dispatch a shared
doorbell register to several threads?

If this is a case that real-world applications need then we should
tackle it. This is where eBPF would be appropriate. I guess the
interface would be something like:

  /*
   * A custom demultiplexer function that returns the index of the <wfd,
   * rfd> pair to use or -1 to produce a KVM_EXIT_IOREGION_FAILURE that
   * userspace must handle.
   */
  int demux(const struct ioregionfd_cmd *cmd);

Userspace can install an eBPF demux function as well as an array of
<wfd, rfd> fd pairs. The demux function gets to look at the cmd in order
to decide which fd pair it is sent to.

This is how I think eBPF datamatch could work. It's not as general as in
our original discussion where we also talked about custom protocols
(instead of struct ioregionfd_cmd/struct ioregionfd_resp).

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-13 15:52                         ` Stefan Hajnoczi
@ 2021-01-14  4:05                           ` Jason Wang
  2021-01-14 16:16                             ` Stefan Hajnoczi
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2021-01-14  4:05 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva


On 2021/1/13 下午11:52, Stefan Hajnoczi wrote:
> On Wed, Jan 13, 2021 at 10:38:29AM +0800, Jason Wang wrote:
>> On 2021/1/8 上午1:53, Stefan Hajnoczi wrote:
>>> On Thu, Jan 07, 2021 at 11:30:47AM +0800, Jason Wang wrote:
>>>> On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
>>>>> On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
>>>>>> On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
>>>>>>> On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
>>>>>>>> On 2021/1/5 上午8:02, Elena Afanasova wrote:
>>>>>>>>> On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
>>>>>>>>>> On 2021/1/4 上午4:32, Elena Afanasova wrote:
>>>>>>>>>>> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>>>>>>>>>>>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>> 2. If separate userspace threads process the virtqueues, then set up the
>>>      virtio-pci capabilities so the virtqueues have separate notification
>>>      registers:
>>>      https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-1150004
>>
>> Right. But this works only when PCI transport is used and queue index could
>> be deduced from the register address (separated doorbell).
>>
>> If we use MMIO or sharing the doorbell registers among all the virtqueues
>> (multiplexer is zero in the above case) , it can't work without datamatch.
> True. Can you think of an application that needs to dispatch a shared
> doorbell register to several threads?


I think it depends on the semantics of the doorbell register. I guess one 
example is the virtio-mmio multiqueue device.


>
> If this is a case that real-world applications need then we should
> tackle it. This is where eBPF would be appropriate. I guess the
> interface would be something like:
>
>    /*
>     * A custom demultiplexer function that returns the index of the <wfd,
>     * rfd> pair to use or -1 to produce a KVM_EXIT_IOREGION_FAILURE that
>     * userspace must handle.
>     */
>    int demux(const struct ioregionfd_cmd *cmd);
>
> Userspace can install an eBPF demux function as well as an array of
> <wfd, rfd> fd pairs. The demux function gets to look at the cmd in order
> to decide which fd pair it is sent to.
>
> This is how I think eBPF datamatch could work. It's not as general as in
> our original discussion where we also talked about custom protocols
> (instead of struct ioregionfd_cmd/struct ioregionfd_resp).


Actually they are not conflict. We can make it a eBPF ioregion, then 
it's the eBPF program that can decide:

1) whether or not it need to do datamatch
2) how many file descriptors it want to use (store the fd in a map)
3) how will the protocol looks like

But as discussed it could be an add-on on top of the hard logic of 
ioregion since there could be cases where eBPF may not be allowed or not 
supported. So adding simple datamatch support as a start might be a good 
choice.

Thanks


>
> Stefan


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-14  4:05                           ` Jason Wang
@ 2021-01-14 16:16                             ` Stefan Hajnoczi
  2021-01-15  3:41                               ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-14 16:16 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 3409 bytes --]

On Thu, Jan 14, 2021 at 12:05:00PM +0800, Jason Wang wrote:
> 
> On 2021/1/13 下午11:52, Stefan Hajnoczi wrote:
> > On Wed, Jan 13, 2021 at 10:38:29AM +0800, Jason Wang wrote:
> > > On 2021/1/8 上午1:53, Stefan Hajnoczi wrote:
> > > > On Thu, Jan 07, 2021 at 11:30:47AM +0800, Jason Wang wrote:
> > > > > On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
> > > > > > On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
> > > > > > > On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
> > > > > > > > On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
> > > > > > > > > On 2021/1/5 上午8:02, Elena Afanasova wrote:
> > > > > > > > > > On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
> > > > > > > > > > > On 2021/1/4 上午4:32, Elena Afanasova wrote:
> > > > > > > > > > > > On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
> > > > > > > > > > > > > On 2020/12/29 下午6:02, Elena Afanasova wrote:
> > > > 2. If separate userspace threads process the virtqueues, then set up the
> > > >      virtio-pci capabilities so the virtqueues have separate notification
> > > >      registers:
> > > >      https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-1150004
> > > 
> > > Right. But this works only when PCI transport is used and queue index could
> > > be deduced from the register address (separated doorbell).
> > > 
> > > If we use MMIO or sharing the doorbell registers among all the virtqueues
> > > (multiplexer is zero in the above case) , it can't work without datamatch.
> > True. Can you think of an application that needs to dispatch a shared
> > doorbell register to several threads?
> 
> 
> I think it depends on the semantics of the doorbell register. I guess one
> example is the virtio-mmio multiqueue device.

Good point. virtio-mmio really needs datamatch if virtqueues are handled
by different threads.

> > If this is a case that real-world applications need then we should
> > tackle it. This is where eBPF would be appropriate. I guess the
> > interface would be something like:
> > 
> >    /*
> >     * A custom demultiplexer function that returns the index of the <wfd,
> >     * rfd> pair to use or -1 to produce a KVM_EXIT_IOREGION_FAILURE that
> >     * userspace must handle.
> >     */
> >    int demux(const struct ioregionfd_cmd *cmd);
> > 
> > Userspace can install an eBPF demux function as well as an array of
> > <wfd, rfd> fd pairs. The demux function gets to look at the cmd in order
> > to decide which fd pair it is sent to.
> > 
> > This is how I think eBPF datamatch could work. It's not as general as in
> > our original discussion where we also talked about custom protocols
> > (instead of struct ioregionfd_cmd/struct ioregionfd_resp).
> 
> 
> Actually they are not conflict. We can make it a eBPF ioregion, then it's
> the eBPF program that can decide:
> 
> 1) whether or not it need to do datamatch
> 2) how many file descriptors it want to use (store the fd in a map)
> 3) how will the protocol looks like
> 
> But as discussed it could be an add-on on top of the hard logic of ioregion
> since there could be cases where eBPF may not be allowed or not supported. So
> adding simple datamatch support as a start might be a good choice.

Let's go further. Can you share pseudo-code for the eBPF program's
function signature (inputs/outputs)?

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION
  2021-01-14 16:16                             ` Stefan Hajnoczi
@ 2021-01-15  3:41                               ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2021-01-15  3:41 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva


On 2021/1/15 上午12:16, Stefan Hajnoczi wrote:
> On Thu, Jan 14, 2021 at 12:05:00PM +0800, Jason Wang wrote:
>> On 2021/1/13 下午11:52, Stefan Hajnoczi wrote:
>>> On Wed, Jan 13, 2021 at 10:38:29AM +0800, Jason Wang wrote:
>>>> On 2021/1/8 上午1:53, Stefan Hajnoczi wrote:
>>>>> On Thu, Jan 07, 2021 at 11:30:47AM +0800, Jason Wang wrote:
>>>>>> On 2021/1/6 下午11:05, Stefan Hajnoczi wrote:
>>>>>>> On Wed, Jan 06, 2021 at 01:21:43PM +0800, Jason Wang wrote:
>>>>>>>> On 2021/1/5 下午6:25, Stefan Hajnoczi wrote:
>>>>>>>>> On Tue, Jan 05, 2021 at 11:53:01AM +0800, Jason Wang wrote:
>>>>>>>>>> On 2021/1/5 上午8:02, Elena Afanasova wrote:
>>>>>>>>>>> On Mon, 2021-01-04 at 13:34 +0800, Jason Wang wrote:
>>>>>>>>>>>> On 2021/1/4 上午4:32, Elena Afanasova wrote:
>>>>>>>>>>>>> On Thu, 2020-12-31 at 11:45 +0800, Jason Wang wrote:
>>>>>>>>>>>>>> On 2020/12/29 下午6:02, Elena Afanasova wrote:
>>>>> 2. If separate userspace threads process the virtqueues, then set up the
>>>>>       virtio-pci capabilities so the virtqueues have separate notification
>>>>>       registers:
>>>>>       https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-1150004
>>>> Right. But this works only when PCI transport is used and queue index could
>>>> be deduced from the register address (separated doorbell).
>>>>
>>>> If we use MMIO or sharing the doorbell registers among all the virtqueues
>>>> (multiplexer is zero in the above case) , it can't work without datamatch.
>>> True. Can you think of an application that needs to dispatch a shared
>>> doorbell register to several threads?
>>
>> I think it depends on the semantics of the doorbell register. I guess one
>> example is the virtio-mmio multiqueue device.
> Good point. virtio-mmio really needs datamatch if virtqueues are handled
> by different threads.
>
>>> If this is a case that real-world applications need then we should
>>> tackle it. This is where eBPF would be appropriate. I guess the
>>> interface would be something like:
>>>
>>>     /*
>>>      * A custom demultiplexer function that returns the index of the <wfd,
>>>      * rfd> pair to use or -1 to produce a KVM_EXIT_IOREGION_FAILURE that
>>>      * userspace must handle.
>>>      */
>>>     int demux(const struct ioregionfd_cmd *cmd);
>>>
>>> Userspace can install an eBPF demux function as well as an array of
>>> <wfd, rfd> fd pairs. The demux function gets to look at the cmd in order
>>> to decide which fd pair it is sent to.
>>>
>>> This is how I think eBPF datamatch could work. It's not as general as in
>>> our original discussion where we also talked about custom protocols
>>> (instead of struct ioregionfd_cmd/struct ioregionfd_resp).
>>
>> Actually they are not conflict. We can make it a eBPF ioregion, then it's
>> the eBPF program that can decide:
>>
>> 1) whether or not it need to do datamatch
>> 2) how many file descriptors it want to use (store the fd in a map)
>> 3) how will the protocol looks like
>>
>> But as discussed it could be an add-on on top of the hard logic of ioregion
>> since there could be cases where eBPF may not be allowed or not supported. So
>> adding simple datamatch support as a start might be a good choice.
> Let's go further. Can you share pseudo-code for the eBPF program's
> function signature (inputs/outputs)?


It could be something like this:

1) The eBPF program context could be defined as ioregion_ctx:

struct ioregion_ctx {
     gpa_t addr;
     int len;
     void *val;
};

2) The eBPF program return value could be, 0 (IOREGION_OK) means that 
the program can handle this I/O request, otherwise failure 
(IOREGION_FAIL)

So for implementing the datamatch, userspace is required to store the 
file descriptors for doorbell dispatching in a map (dispatch_map). For 
virtio style doorbell, we can simply:

- find the fd via bpf map lookup
- build the protocol
- use the eBPF helper to send the command (I didn't check, but I guess we 
need to invent new eBPF helpers to read from and write to a file)

Like:

SEC("datamatch")
int datamatch_prog(struct ioregion_ctx *ctx)
{
     int *fd, ret;
     struct customized_protocol protocol;
     fd = bpf_map_lookup_elem(&ctx->val, &dispatch_map);
     if (!fd)
         return IOREGION_FAIL;
     build_protocol(ctx, &protocol);
     ret = bpf_fd_write(fd, &protocol, sizeof(protocol));
     if (ret != sizeof(protocol))
         return IOREGION_FAIL;
     return IOREGION_OK;
}

Thanks


>
> Stefan


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-01-15  3:43 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-29 10:02 [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
2020-12-29 10:02 ` [RFC 1/2] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
2020-12-29 11:36   ` Stefan Hajnoczi
2020-12-30 12:14     ` Elena Afanasova
2020-12-31  3:45   ` Jason Wang
2021-01-03 20:32     ` Elena Afanasova
2021-01-04  5:34       ` Jason Wang
2021-01-05  0:02         ` Elena Afanasova
2021-01-05  3:53           ` Jason Wang
2021-01-05 10:25             ` Stefan Hajnoczi
2021-01-06  5:21               ` Jason Wang
2021-01-06 15:05                 ` Stefan Hajnoczi
2021-01-07  3:30                   ` Jason Wang
2021-01-07 17:53                     ` Stefan Hajnoczi
2021-01-13  2:38                       ` Jason Wang
2021-01-13 15:52                         ` Stefan Hajnoczi
2021-01-14  4:05                           ` Jason Wang
2021-01-14 16:16                             ` Stefan Hajnoczi
2021-01-15  3:41                               ` Jason Wang
2020-12-29 10:02 ` [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations Elena Afanasova
2020-12-29 12:00   ` Stefan Hajnoczi
2020-12-30 12:24     ` Elena Afanasova
2020-12-31  3:46   ` Jason Wang
2021-01-03 20:37     ` Elena Afanasova
2021-01-04  5:37       ` Jason Wang
2021-01-05  0:06         ` Elena Afanasova
2020-12-29 12:06 ` [RFC 0/2] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
2020-12-30 17:56   ` Elena Afanasova

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).