All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
@ 2021-01-28 18:32 Elena Afanasova
  2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-01-28 18:32 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

This patchset introduces a KVM dispatch mechanism which can be used 
for handling MMIO/PIO accesses over file descriptors without returning 
from ioctl(KVM_RUN). This allows device emulation to run in another task 
separate from the vCPU task.

This is achieved through KVM vm ioctl for registering MMIO/PIO regions and 
a wire protocol that KVM uses to communicate with a task handling an 
MMIO/PIO access.

TODOs:
* Implement KVM_EXIT_IOREGIONFD_FAILURE
* Add non-x86 arch support
* Add kvm-unittests

Elena Afanasova (4):
  KVM: add initial support for KVM_SET_IOREGION
  KVM: x86: add support for ioregionfd signal handling
  KVM: add support for ioregionfd cmds/replies serialization
  KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled

 arch/x86/kvm/Kconfig          |   1 +
 arch/x86/kvm/Makefile         |   1 +
 arch/x86/kvm/x86.c            | 216 ++++++++++++++-
 include/kvm/iodev.h           |  14 +
 include/linux/kvm_host.h      |  34 +++
 include/uapi/linux/ioregion.h |  32 +++
 include/uapi/linux/kvm.h      |  23 ++
 virt/kvm/Kconfig              |   3 +
 virt/kvm/eventfd.c            |  25 ++
 virt/kvm/eventfd.h            |  14 +
 virt/kvm/ioregion.c           | 479 ++++++++++++++++++++++++++++++++++
 virt/kvm/ioregion.h           |  15 ++
 virt/kvm/kvm_main.c           |  68 ++++-
 13 files changed, 905 insertions(+), 20 deletions(-)
 create mode 100644 include/uapi/linux/ioregion.h
 create mode 100644 virt/kvm/eventfd.h
 create mode 100644 virt/kvm/ioregion.c
 create mode 100644 virt/kvm/ioregion.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
@ 2021-01-28 18:32 ` Elena Afanasova
  2021-01-30 16:58   ` Stefan Hajnoczi
                     ` (2 more replies)
  2021-01-28 18:32 ` [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization Elena Afanasova
                   ` (5 subsequent siblings)
  6 siblings, 3 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-01-28 18:32 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

The vCPU thread may receive a signal during ioregionfd communication.
In that case ioctl(KVM_RUN) needs to return to userspace, and the next
ioctl(KVM_RUN) must resume the interrupted ioregionfd operation.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
Changes in v2:
  - add support for x86 signal handling
  - changes after code review

 arch/x86/kvm/x86.c            | 196 +++++++++++++++++++++++++++++++---
 include/linux/kvm_host.h      |  13 +++
 include/uapi/linux/ioregion.h |  32 ++++++
 virt/kvm/ioregion.c           | 177 +++++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c           |  16 ++-
 5 files changed, 415 insertions(+), 19 deletions(-)
 create mode 100644 include/uapi/linux/ioregion.h

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ddb28f5ca252..a04516b531da 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 {
 	int handled = 0;
 	int n;
+	int ret = 0;
+	bool is_apic;
 
 	do {
 		n = min(len, 8);
-		if (!(lapic_in_kernel(vcpu) &&
-		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
-		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
-			break;
+		is_apic = lapic_in_kernel(vcpu) &&
+			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
+					      addr, n, v);
+		if (!is_apic) {
+			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
+					       addr, n, v);
+			if (ret)
+				break;
+		}
 		handled += n;
 		addr += n;
 		len -= n;
 		v += n;
 	} while (len);
 
+#ifdef CONFIG_KVM_IOREGION
+	if (ret == -EINTR) {
+		vcpu->run->exit_reason = KVM_EXIT_INTR;
+		++vcpu->stat.signal_exits;
+	}
+#endif
+
 	return handled;
 }
 
@@ -5819,14 +5833,20 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 {
 	int handled = 0;
 	int n;
+	int ret = 0;
+	bool is_apic;
 
 	do {
 		n = min(len, 8);
-		if (!(lapic_in_kernel(vcpu) &&
-		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
-					 addr, n, v))
-		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
-			break;
+		is_apic = lapic_in_kernel(vcpu) &&
+			  !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+					     addr, n, v);
+		if (!is_apic) {
+			ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS,
+					      addr, n, v);
+			if (ret)
+				break;
+		}
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
 		handled += n;
 		addr += n;
@@ -5834,6 +5854,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 		v += n;
 	} while (len);
 
+#ifdef CONFIG_KVM_IOREGION
+	if (ret == -EINTR) {
+		vcpu->run->exit_reason = KVM_EXIT_INTR;
+		++vcpu->stat.signal_exits;
+	}
+#endif
+
 	return handled;
 }
 
@@ -6294,6 +6321,12 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_cur_fragment = 0;
 
+#ifdef CONFIG_KVM_IOREGION
+	if (vcpu->ioregion_interrupted &&
+	    vcpu->run->exit_reason == KVM_EXIT_INTR)
+		return (vcpu->ioregion_ctx.in) ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+#endif
+
 	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
@@ -6411,16 +6444,23 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 
 	for (i = 0; i < vcpu->arch.pio.count; i++) {
 		if (vcpu->arch.pio.in)
-			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
+					    vcpu->arch.pio.port,
 					    vcpu->arch.pio.size, pd);
 		else
 			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
-					     vcpu->arch.pio.port, vcpu->arch.pio.size,
-					     pd);
+					     vcpu->arch.pio.port,
+					     vcpu->arch.pio.size, pd);
 		if (r)
 			break;
 		pd += vcpu->arch.pio.size;
 	}
+#ifdef CONFIG_KVM_IOREGION
+	if (vcpu->ioregion_interrupted && r == -EINTR) {
+		vcpu->ioregion_ctx.pio = i;
+	}
+#endif
+
 	return r;
 }
 
@@ -6428,16 +6468,27 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
 			       unsigned short port, void *val,
 			       unsigned int count, bool in)
 {
+	int ret = 0;
+
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.count  = count;
 	vcpu->arch.pio.size = size;
 
-	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+	ret = kernel_pio(vcpu, vcpu->arch.pio_data);
+	if (!ret) {
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
+#ifdef CONFIG_KVM_IOREGION
+	if (ret == -EINTR) {
+		vcpu->run->exit_reason = KVM_EXIT_INTR;
+		++vcpu->stat.signal_exits;
+		return 0;
+	}
+#endif
+
 	vcpu->run->exit_reason = KVM_EXIT_IO;
 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 	vcpu->run->io.size = size;
@@ -7141,6 +7192,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM_IOREGION
+static int complete_ioregion_io(struct kvm_vcpu *vcpu);
+static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu);
+#endif
 
 static void kvm_smm_changed(struct kvm_vcpu *vcpu)
 {
@@ -7405,6 +7460,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		r = 1;
 		if (inject_emulated_exception(vcpu))
 			return r;
+#ifdef CONFIG_KVM_IOREGION
+	} else if (vcpu->ioregion_interrupted &&
+		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
+		if (vcpu->ioregion_ctx.in)
+			writeback = false;
+		vcpu->arch.complete_userspace_io = complete_ioregion_io;
+		r = 0;
+#endif
 	} else if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in) {
 			/* FIXME: return into emulator if single-stepping.  */
@@ -7501,6 +7564,11 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
 		vcpu->arch.complete_userspace_io =
 			complete_fast_pio_out_port_0x7e;
 		kvm_skip_emulated_instruction(vcpu);
+#ifdef CONFIG_KVM_IOREGION
+	} else if (vcpu->ioregion_interrupted &&
+		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
+		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
+#endif
 	} else {
 		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
 		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
@@ -7548,6 +7616,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
 		return ret;
 	}
 
+#ifdef CONFIG_KVM_IOREGION
+	if (vcpu->ioregion_interrupted &&
+	    vcpu->run->exit_reason == KVM_EXIT_INTR) {
+		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
+		return 0;
+	}
+#endif
 	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
 	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
 
@@ -9204,6 +9279,101 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+#ifdef CONFIG_KVM_IOREGION
+static void complete_ioregion_access(struct kvm_vcpu *vcpu, gpa_t addr,
+				     int len, void *val)
+{
+	if (vcpu->ioregion_ctx.in)
+		vcpu->ioregion_ctx.dev->ops->read(vcpu, vcpu->ioregion_ctx.dev,
+						  addr, len, val);
+	else
+		vcpu->ioregion_ctx.dev->ops->write(vcpu, vcpu->ioregion_ctx.dev,
+						   addr, len, val);
+}
+
+static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmio_fragment *frag;
+	int idx, ret, i, n;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	for (i = vcpu->mmio_cur_fragment; i < vcpu->mmio_nr_fragments; i++) {
+		frag = &vcpu->mmio_fragments[i];
+		do {
+			n = min(8u, frag->len);
+			complete_ioregion_access(vcpu, frag->gpa, n, frag->data);
+			frag->len -= n;
+			frag->data += n;
+			frag->gpa += n;
+		} while (frag->len);
+		vcpu->mmio_cur_fragment++;
+	}
+
+	vcpu->mmio_needed = 0;
+	if (!vcpu->ioregion_ctx.in) {
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		return 1;
+	}
+
+	vcpu->mmio_read_completed = 1;
+	ret = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	return ret;
+}
+
+static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
+{
+	int i, idx, r = 1;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	for (i = vcpu->ioregion_ctx.pio; i < vcpu->arch.pio.count; i++) {
+		complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
+					 vcpu->ioregion_ctx.len,
+					 vcpu->ioregion_ctx.val);
+		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
+	}
+
+	if (vcpu->ioregion_ctx.in)
+		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	vcpu->arch.pio.count = 0;
+
+	return r;
+}
+
+static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
+{
+	int idx;
+	u64 val;
+
+	BUG_ON(!vcpu->ioregion_interrupted);
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
+				 vcpu->ioregion_ctx.len,
+				 vcpu->ioregion_ctx.val);
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (vcpu->ioregion_ctx.in) {
+		memcpy(&val, vcpu->ioregion_ctx.val, vcpu->ioregion_ctx.len);
+		kvm_rax_write(vcpu, val);
+	}
+	vcpu->arch.pio.count = 0;
+
+	return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int complete_ioregion_io(struct kvm_vcpu *vcpu)
+{
+	BUG_ON(!vcpu->ioregion_interrupted);
+
+	if (vcpu->mmio_needed)
+		return complete_ioregion_mmio(vcpu);
+	if (vcpu->arch.pio.count)
+		return complete_ioregion_pio(vcpu);
+}
+#endif
+
 static void kvm_save_current_fpu(struct fpu *fpu)
 {
 	/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7cd667dddba9..5cfdecfca6db 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -318,6 +318,19 @@ struct kvm_vcpu {
 #endif
 	bool preempted;
 	bool ready;
+#ifdef CONFIG_KVM_IOREGION
+	bool ioregion_interrupted;
+	struct {
+		struct kvm_io_device *dev;
+		int pio;
+		void *val;
+		u8 state;
+		u64 addr;
+		int len;
+		u64 data;
+		bool in;
+	} ioregion_ctx;
+#endif
 	struct kvm_vcpu_arch arch;
 };
 
diff --git a/include/uapi/linux/ioregion.h b/include/uapi/linux/ioregion.h
new file mode 100644
index 000000000000..7898c01f84a1
--- /dev/null
+++ b/include/uapi/linux/ioregion.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_IOREGION_H
+#define _UAPI_LINUX_IOREGION_H
+
+/* Wire protocol */
+struct ioregionfd_cmd {
+	__u32 info;
+	__u32 padding;
+	__u64 user_data;
+	__u64 offset;
+	__u64 data;
+};
+
+struct ioregionfd_resp {
+	__u64 data;
+	__u8 pad[24];
+};
+
+#define IOREGIONFD_CMD_READ    0
+#define IOREGIONFD_CMD_WRITE   1
+
+#define IOREGIONFD_SIZE_8BIT   0
+#define IOREGIONFD_SIZE_16BIT  1
+#define IOREGIONFD_SIZE_32BIT  2
+#define IOREGIONFD_SIZE_64BIT  3
+
+#define IOREGIONFD_SIZE_OFFSET 4
+#define IOREGIONFD_RESP_OFFSET 6
+#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
+#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)
+
+#endif
diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
index 48ff92bca966..da38124e1418 100644
--- a/virt/kvm/ioregion.c
+++ b/virt/kvm/ioregion.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <kvm/iodev.h>
 #include "eventfd.h"
+#include <uapi/linux/ioregion.h>
 
 void
 kvm_ioregionfd_init(struct kvm *kvm)
@@ -38,18 +39,190 @@ ioregion_release(struct ioregion *p)
 	kfree(p);
 }
 
+static bool
+pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, int resp,
+	 u64 user_data, const void *val)
+{
+	u64 size = 0;
+
+	switch (len) {
+	case 1:
+		size = IOREGIONFD_SIZE_8BIT;
+		break;
+	case 2:
+		size = IOREGIONFD_SIZE_16BIT;
+		break;
+	case 4:
+		size = IOREGIONFD_SIZE_32BIT;
+		break;
+	case 8:
+		size = IOREGIONFD_SIZE_64BIT;
+		break;
+	default:
+		return false;
+	}
+
+	if (val)
+		memcpy(&cmd->data, val, len);
+	cmd->user_data = user_data;
+	cmd->offset = offset;
+	cmd->info |= opt;
+	cmd->info |= IOREGIONFD_SIZE(size);
+	cmd->info |= IOREGIONFD_RESP(resp);
+
+	return true;
+}
+
+enum {
+	SEND_CMD,
+	GET_REPLY,
+	COMPLETE
+};
+
+static void
+ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+		  bool in, gpa_t addr, int len, u64 data, u8 state, void *val)
+{
+	vcpu->ioregion_interrupted = true;
+
+	vcpu->ioregion_ctx.dev = this;
+	vcpu->ioregion_ctx.val = val;
+	vcpu->ioregion_ctx.state = state;
+	vcpu->ioregion_ctx.addr = addr;
+	vcpu->ioregion_ctx.len = len;
+	vcpu->ioregion_ctx.data = data;
+	vcpu->ioregion_ctx.in = in;
+}
+
 static int
 ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	      int len, void *val)
 {
-	return -EOPNOTSUPP;
+	struct ioregion *p = to_ioregion(this);
+	union {
+		struct ioregionfd_cmd cmd;
+		struct ioregionfd_resp resp;
+	} buf;
+	int ret = 0;
+	int state = 0;
+
+	if ((addr + len - 1) > (p->paddr + p->size - 1))
+		return -EINVAL;
+
+	if (unlikely(vcpu->ioregion_interrupted)) {
+		vcpu->ioregion_interrupted = false;
+
+		switch (vcpu->ioregion_ctx.state) {
+		case SEND_CMD:
+			goto send_cmd;
+		case GET_REPLY:
+			goto get_repl;
+		case COMPLETE:
+			memcpy(val, &vcpu->ioregion_ctx.data, len);
+			return 0;
+		}
+	}
+
+send_cmd:
+	memset(&buf, 0, sizeof(buf));
+	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
+		      1, p->user_data, NULL))
+		return -EOPNOTSUPP;
+
+	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
+	state = (ret == sizeof(buf.cmd));
+	if (signal_pending(current)) {
+		ioregion_save_ctx(vcpu, this, 1, addr, len, 0, state, val);
+		return -EINTR;
+	}
+	if (ret != sizeof(buf.cmd)) {
+		ret = (ret < 0) ? ret : -EIO;
+		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+	}
+
+get_repl:
+	memset(&buf, 0, sizeof(buf));
+	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+	state += (ret == sizeof(buf.resp));
+	if (signal_pending(current)) {
+		ioregion_save_ctx(vcpu, this, 1, addr, len, buf.resp.data, state, val);
+		return -EINTR;
+	}
+	if (ret != sizeof(buf.resp)) {
+		ret = (ret < 0) ? ret : -EIO;
+		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+	}
+
+	memcpy(val, &buf.resp.data, len);
+
+	return 0;
 }
 
 static int
 ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		int len, const void *val)
 {
-	return -EOPNOTSUPP;
+	struct ioregion *p = to_ioregion(this);
+	union {
+		struct ioregionfd_cmd cmd;
+		struct ioregionfd_resp resp;
+	} buf;
+	int ret = 0;
+	int state = 0;
+
+	if ((addr + len - 1) > (p->paddr + p->size - 1))
+		return -EINVAL;
+
+	if (unlikely(vcpu->ioregion_interrupted)) {
+		vcpu->ioregion_interrupted = false;
+
+		switch (vcpu->ioregion_ctx.state) {
+		case SEND_CMD:
+			goto send_cmd;
+		case GET_REPLY:
+			if (!p->posted_writes)
+				goto get_repl;
+			fallthrough;
+		case COMPLETE:
+			return 0;
+		}
+	}
+
+send_cmd:
+	memset(&buf, 0, sizeof(buf));
+	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
+		      p->posted_writes ? 0 : 1, p->user_data, val))
+		return -EOPNOTSUPP;
+
+	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
+	state = (ret == sizeof(buf.cmd));
+	if (signal_pending(current)) {
+		ioregion_save_ctx(vcpu, this, 0, addr, len,
+				  0, state, (void *)val);
+		return -EINTR;
+	}
+	if (ret != sizeof(buf.cmd)) {
+		ret = (ret < 0) ? ret : -EIO;
+		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+	}
+
+get_repl:
+	if (!p->posted_writes) {
+		memset(&buf, 0, sizeof(buf));
+		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+		state += (ret == sizeof(buf.resp));
+		if (signal_pending(current)) {
+			ioregion_save_ctx(vcpu, this, 0, addr, len,
+					  0, state, (void *)val);
+			return -EINTR;
+		}
+		if (ret != sizeof(buf.resp)) {
+			ret = (ret < 0) ? ret : -EIO;
+			return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		}
+	}
+
+	return 0;
 }
 
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 88b92fc3da51..df387857f51f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 			      struct kvm_io_range *range, const void *val)
 {
 	int idx;
+	int ret = 0;
 
 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
 	if (idx < 0)
@@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 
 	while (idx < bus->dev_count &&
 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
-					range->len, val))
+		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
+					 range->len, val);
+		if (!ret)
 			return idx;
+		if (ret < 0 && ret != -EOPNOTSUPP)
+			return ret;
 		idx++;
 	}
 
@@ -4264,6 +4268,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 			     struct kvm_io_range *range, void *val)
 {
 	int idx;
+	int ret = 0;
 
 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
 	if (idx < 0)
@@ -4271,9 +4276,12 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 
 	while (idx < bus->dev_count &&
 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
-				       range->len, val))
+		ret = kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
+					range->len, val);
+		if (!ret)
 			return idx;
+		if (ret < 0 && ret != -EOPNOTSUPP)
+			return ret;
 		idx++;
 	}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
  2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
@ 2021-01-28 18:32 ` Elena Afanasova
  2021-01-30 18:54   ` Stefan Hajnoczi
  2021-01-28 18:32 ` [RFC v2 4/4] KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled Elena Afanasova
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Elena Afanasova @ 2021-01-28 18:32 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

Add an ioregionfd context and kvm_io_device_ops->prepare/finish()
in order to serialize all bytes requested by the guest.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
 arch/x86/kvm/x86.c       |  19 ++++++++
 include/kvm/iodev.h      |  14 ++++++
 include/linux/kvm_host.h |   4 ++
 virt/kvm/ioregion.c      | 102 +++++++++++++++++++++++++++++++++------
 virt/kvm/kvm_main.c      |  32 ++++++++++++
 5 files changed, 157 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a04516b531da..393fb0f4bf46 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5802,6 +5802,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 	int ret = 0;
 	bool is_apic;
 
+	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
+
 	do {
 		n = min(len, 8);
 		is_apic = lapic_in_kernel(vcpu) &&
@@ -5823,8 +5825,10 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 	if (ret == -EINTR) {
 		vcpu->run->exit_reason = KVM_EXIT_INTR;
 		++vcpu->stat.signal_exits;
+		return handled;
 	}
 #endif
+	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);
 
 	return handled;
 }
@@ -5836,6 +5840,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	int ret = 0;
 	bool is_apic;
 
+	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
+
 	do {
 		n = min(len, 8);
 		is_apic = lapic_in_kernel(vcpu) &&
@@ -5858,8 +5864,10 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	if (ret == -EINTR) {
 		vcpu->run->exit_reason = KVM_EXIT_INTR;
 		++vcpu->stat.signal_exits;
+		return handled;
 	}
 #endif
+	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);
 
 	return handled;
 }
@@ -6442,6 +6450,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
 	int r = 0, i;
 
+	kvm_io_bus_prepare(vcpu, KVM_PIO_BUS,
+			   vcpu->arch.pio.port,
+			   vcpu->arch.pio.size);
+
 	for (i = 0; i < vcpu->arch.pio.count; i++) {
 		if (vcpu->arch.pio.in)
 			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
@@ -6458,8 +6470,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 #ifdef CONFIG_KVM_IOREGION
 	if (vcpu->ioregion_interrupted && r == -EINTR) {
 		vcpu->ioregion_ctx.pio = i;
+		return r;
 	}
 #endif
+	kvm_io_bus_finish(vcpu, KVM_PIO_BUS,
+			  vcpu->arch.pio.port,
+			  vcpu->arch.pio.size);
 
 	return r;
 }
@@ -9309,6 +9325,7 @@ static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
 		vcpu->mmio_cur_fragment++;
 	}
 
+	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
 	vcpu->mmio_needed = 0;
 	if (!vcpu->ioregion_ctx.in) {
 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -9333,6 +9350,7 @@ static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
 		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
 	}
 
+	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
 	if (vcpu->ioregion_ctx.in)
 		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -9352,6 +9370,7 @@ static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
 	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
 				 vcpu->ioregion_ctx.len,
 				 vcpu->ioregion_ctx.val);
+	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
 	if (vcpu->ioregion_ctx.in) {
diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
index d75fc4365746..db8a3c69b7bb 100644
--- a/include/kvm/iodev.h
+++ b/include/kvm/iodev.h
@@ -25,6 +25,8 @@ struct kvm_io_device_ops {
 		     gpa_t addr,
 		     int len,
 		     const void *val);
+	void (*prepare)(struct kvm_io_device *this);
+	void (*finish)(struct kvm_io_device *this);
 	void (*destructor)(struct kvm_io_device *this);
 };
 
@@ -55,6 +57,18 @@ static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
 				 : -EOPNOTSUPP;
 }
 
+static inline void kvm_iodevice_prepare(struct kvm_io_device *dev)
+{
+	if (dev->ops->prepare)
+		dev->ops->prepare(dev);
+}
+
+static inline void kvm_iodevice_finish(struct kvm_io_device *dev)
+{
+	if (dev->ops->finish)
+		dev->ops->finish(dev);
+}
+
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
 {
 	if (dev->ops->destructor)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5cfdecfca6db..f6b9ff4c468d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -194,6 +194,10 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 			       struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 					 gpa_t addr);
+void kvm_io_bus_prepare(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+			int len);
+void kvm_io_bus_finish(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		       int len);
 
 #ifdef CONFIG_KVM_ASYNC_PF
 struct kvm_async_pf {
diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
index da38124e1418..3474090ccc8c 100644
--- a/virt/kvm/ioregion.c
+++ b/virt/kvm/ioregion.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/kvm_host.h>
-#include <linux/fs.h>
+#include <linux/wait.h>
 #include <kvm/iodev.h>
 #include "eventfd.h"
 #include <uapi/linux/ioregion.h>
@@ -12,15 +12,23 @@ kvm_ioregionfd_init(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->ioregions_pio);
 }
 
+/* Serializes ioregionfd cmds/replies */
+struct ioregionfd {
+	wait_queue_head_t	  wq;
+	struct file		 *rf;
+	struct kref		  kref;
+	bool			  busy;
+};
+
 struct ioregion {
-	struct list_head     list;
-	u64                  paddr;  /* guest physical address */
-	u64                  size;   /* size in bytes */
-	struct file         *rf;
-	struct file         *wf;
-	u64                  user_data; /* opaque token used by userspace */
-	struct kvm_io_device dev;
-	bool                 posted_writes;
+	struct list_head	  list;
+	u64			  paddr;   /* guest physical address */
+	u64			  size;    /* size in bytes */
+	struct file		 *wf;
+	u64			  user_data; /* opaque token used by userspace */
+	struct kvm_io_device	  dev;
+	bool			  posted_writes;
+	struct ioregionfd	 *ctx;
 };
 
 static inline struct ioregion *
@@ -29,13 +37,22 @@ to_ioregion(struct kvm_io_device *dev)
 	return container_of(dev, struct ioregion, dev);
 }
 
+/* assumes kvm->slots_lock held */
+static void ctx_free(struct kref *kref)
+{
+	struct ioregionfd *ctx = container_of(kref, struct ioregionfd, kref);
+
+	kfree(ctx);
+}
+
 /* assumes kvm->slots_lock held */
 static void
 ioregion_release(struct ioregion *p)
 {
-	fput(p->rf);
+	fput(p->ctx->rf);
 	fput(p->wf);
 	list_del(&p->list);
+	kref_put(&p->ctx->kref, ctx_free);
 	kfree(p);
 }
 
@@ -94,6 +111,28 @@ ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 	vcpu->ioregion_ctx.in = in;
 }
 
+static void
+ioregion_prepare(struct kvm_io_device *this)
+{
+	struct ioregion *p = to_ioregion(this);
+
+	spin_lock(&p->ctx->wq.lock);
+	wait_event_interruptible_exclusive_locked(p->ctx->wq, !p->ctx->busy);
+	p->ctx->busy = true;
+	spin_unlock(&p->ctx->wq.lock);
+}
+
+static void
+ioregion_finish(struct kvm_io_device *this)
+{
+	struct ioregion *p = to_ioregion(this);
+
+	spin_lock(&p->ctx->wq.lock);
+	p->ctx->busy = false;
+	wake_up_locked(&p->ctx->wq);
+	spin_unlock(&p->ctx->wq.lock);
+}
+
 static int
 ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	      int len, void *val)
@@ -142,7 +181,7 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 
 get_repl:
 	memset(&buf, 0, sizeof(buf));
-	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+	ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 	state += (ret == sizeof(buf.resp));
 	if (signal_pending(current)) {
 		ioregion_save_ctx(vcpu, this, 1, addr, len, buf.resp.data, state, val);
@@ -209,7 +248,7 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 get_repl:
 	if (!p->posted_writes) {
 		memset(&buf, 0, sizeof(buf));
-		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+		ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 		state += (ret == sizeof(buf.resp));
 		if (signal_pending(current)) {
 			ioregion_save_ctx(vcpu, this, 0, addr, len,
@@ -240,6 +279,8 @@ ioregion_destructor(struct kvm_io_device *this)
 static const struct kvm_io_device_ops ioregion_ops = {
 	.read       = ioregion_read,
 	.write      = ioregion_write,
+	.prepare    = ioregion_prepare,
+	.finish     = ioregion_finish,
 	.destructor = ioregion_destructor,
 };
 
@@ -295,6 +336,34 @@ get_bus_from_flags(__u32 flags)
 	return KVM_MMIO_BUS;
 }
 
+/* assumes kvm->slots_lock held */
+static bool
+ioregion_get_ctx(struct kvm *kvm, struct ioregion *p, struct file *rf, int bus_idx)
+{
+	struct ioregion *_p;
+	struct list_head *ioregions;
+
+	ioregions = get_ioregion_list(kvm, bus_idx);
+	list_for_each_entry(_p, ioregions, list)
+		if (file_inode(_p->ctx->rf)->i_ino == file_inode(rf)->i_ino) {
+			p->ctx = _p->ctx;
+			kref_get(&p->ctx->kref);
+			return true;
+		}
+
+	p->ctx = kzalloc(sizeof(*p->ctx), GFP_KERNEL_ACCOUNT);
+	if (!p->ctx) {
+		kfree(p);
+		return false;
+	}
+	p->ctx->rf = rf;
+	p->ctx->busy = false;
+	init_waitqueue_head(&p->ctx->wq);
+	kref_get(&p->ctx->kref);
+
+	return true;
+}
+
 int
 kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
 {
@@ -327,11 +396,10 @@ kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
 	}
 
 	INIT_LIST_HEAD(&p->list);
+	p->wf = wfile;
 	p->paddr = args->guest_paddr;
 	p->size = args->memory_size;
 	p->user_data = args->user_data;
-	p->rf = rfile;
-	p->wf = wfile;
 	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
 	bus_idx = get_bus_from_flags(args->flags);
 
@@ -341,6 +409,12 @@ kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
 		ret = -EEXIST;
 		goto unlock_fail;
 	}
+
+	if (!ioregion_get_ctx(kvm, p, rfile, bus_idx)) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
 	kvm_iodevice_init(&p->dev, &ioregion_ops);
 	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
 				      &p->dev);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index df387857f51f..096504a6cc62 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4308,6 +4308,38 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 	return r < 0 ? r : 0;
 }
 
+void kvm_io_bus_prepare(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len)
+{
+	struct kvm_io_bus *bus;
+	int idx;
+
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return;
+
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return;
+
+	kvm_iodevice_prepare(bus->range[idx].dev);
+}
+
+void kvm_io_bus_finish(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len)
+{
+	struct kvm_io_bus *bus;
+	int idx;
+
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return;
+
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return;
+
+	kvm_iodevice_finish(bus->range[idx].dev);
+}
+
 /* Caller must hold slots_lock. */
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC v2 4/4] KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
  2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
  2021-01-28 18:32 ` [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization Elena Afanasova
@ 2021-01-28 18:32 ` Elena Afanasova
  2021-01-29 18:48 ` [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-01-28 18:32 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

ioregionfd relies on kmemcg in order to limit the amount of kernel memory
that userspace can consume. Enforce NR_IOBUS_DEVS hardcoded limit in case
kmemcg is disabled.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
 virt/kvm/kvm_main.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 096504a6cc62..74bedb7272e6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4352,9 +4352,12 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	if (!bus)
 		return -ENOMEM;
 
-	/* exclude ioeventfd which is limited by maximum fd */
-	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
-		return -ENOSPC;
+	/* enforce hard limit if kmemcg is disabled and
+	 * exclude ioeventfd which is limited by maximum fd
+	 */
+	if (!memcg_kmem_enabled())
+		if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
+			return -ENOSPC;
 
 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
 			  GFP_KERNEL_ACCOUNT);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
                   ` (2 preceding siblings ...)
  2021-01-28 18:32 ` [RFC v2 4/4] KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled Elena Afanasova
@ 2021-01-29 18:48 ` Elena Afanasova
  2021-01-30 15:04   ` Stefan Hajnoczi
                     ` (2 more replies)
  2021-01-30 14:56 ` [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
                   ` (2 subsequent siblings)
  6 siblings, 3 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-01-29 18:48 UTC (permalink / raw)
  To: kvm; +Cc: stefanha, jag.raman, elena.ufimtseva, Elena Afanasova

This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
read and write accesses are dispatched through the given ioregionfd
instead of returning from ioctl(KVM_RUN). Regions can be deleted by
setting fds to -1.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
Changes in v2:
  - changes after code review

 arch/x86/kvm/Kconfig     |   1 +
 arch/x86/kvm/Makefile    |   1 +
 arch/x86/kvm/x86.c       |   1 +
 include/linux/kvm_host.h |  17 +++
 include/uapi/linux/kvm.h |  23 ++++
 virt/kvm/Kconfig         |   3 +
 virt/kvm/eventfd.c       |  25 +++++
 virt/kvm/eventfd.h       |  14 +++
 virt/kvm/ioregion.c      | 232 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/ioregion.h      |  15 +++
 virt/kvm/kvm_main.c      |  11 ++
 11 files changed, 343 insertions(+)
 create mode 100644 virt/kvm/eventfd.h
 create mode 100644 virt/kvm/ioregion.c
 create mode 100644 virt/kvm/ioregion.h

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f92dfd8ef10d..b914ef375199 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -33,6 +33,7 @@ config KVM
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
+	select KVM_IOREGION
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..b3b17dc9f7d4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
 				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
+kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e545a8a613b1..ddb28f5ca252 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X86_USER_SPACE_MSR:
 	case KVM_CAP_X86_MSR_FILTER:
 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+	case KVM_CAP_IOREGIONFD:
 		r = 1;
 		break;
 	case KVM_CAP_SYNC_REGS:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7f2e2a09ebbd..7cd667dddba9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -470,6 +470,10 @@ struct kvm {
 		struct mutex      resampler_lock;
 	} irqfds;
 	struct list_head ioeventfds;
+#endif
+#ifdef CONFIG_KVM_IOREGION
+	struct list_head ioregions_mmio;
+	struct list_head ioregions_pio;
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
@@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
+#ifdef CONFIG_KVM_IOREGION
+void kvm_ioregionfd_init(struct kvm *kvm);
+int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
+
+#else
+
+static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
+static inline int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	return -ENOSYS;
+}
+#endif
+
 void kvm_arch_irq_routing_update(struct kvm *kvm);
 
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ca41220b40b8..81e775778c66 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -732,6 +732,27 @@ struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+enum {
+	kvm_ioregion_flag_nr_pio,
+	kvm_ioregion_flag_nr_posted_writes,
+	kvm_ioregion_flag_nr_max,
+};
+
+#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
+#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
+
+#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
+
+struct kvm_ioregion {
+	__u64 guest_paddr; /* guest physical address */
+	__u64 memory_size; /* bytes */
+	__u64 user_data;
+	__s32 rfd;
+	__s32 wfd;
+	__u32 flags;
+	__u8  pad[28];
+};
+
 #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
@@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_USER_SPACE_MSR 188
 #define KVM_CAP_X86_MSR_FILTER 189
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
+#define KVM_CAP_IOREGIONFD 191
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct kvm_ioregion)
 
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1c37ccd5d402..5e6620bbf000 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
 
+config KVM_IOREGION
+       bool
+
 config KVM_MMIO
        bool
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c2323c27a28b..aadb73903f8b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -27,6 +27,7 @@
 #include <trace/events/kvm.h>
 
 #include <kvm/iodev.h>
+#include "ioregion.h"
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
 
@@ -755,6 +756,23 @@ static const struct kvm_io_device_ops ioeventfd_ops = {
 	.destructor = ioeventfd_destructor,
 };
 
+#ifdef CONFIG_KVM_IOREGION
+/* assumes kvm->slots_lock held */
+bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
+			  u64 start, u64 size)
+{
+	struct _ioeventfd *_p;
+
+	list_for_each_entry(_p, &kvm->ioeventfds, list)
+		if (_p->bus_idx == bus_idx &&
+		    overlap(start, size, _p->addr,
+			    !_p->length ? 8 : _p->length))
+			return true;
+
+	return false;
+}
+#endif
+
 /* assumes kvm->slots_lock held */
 static bool
 ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
@@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
 		       _p->datamatch == p->datamatch))))
 			return true;
 
+#ifdef CONFIG_KVM_IOREGION
+	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
+		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
+					  !p->length ? 8 : p->length))
+			return true;
+#endif
+
 	return false;
 }
 
diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
new file mode 100644
index 000000000000..73a621eebae3
--- /dev/null
+++ b/virt/kvm/eventfd.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_EVENTFD_H__
+#define __KVM_EVENTFD_H__
+
+#ifdef CONFIG_KVM_IOREGION
+bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
+#else
+static inline bool
+kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
+{
+	return false;
+}
+#endif
+#endif
diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
new file mode 100644
index 000000000000..48ff92bca966
--- /dev/null
+++ b/virt/kvm/ioregion.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <kvm/iodev.h>
+#include "eventfd.h"
+
+void
+kvm_ioregionfd_init(struct kvm *kvm)
+{
+	INIT_LIST_HEAD(&kvm->ioregions_mmio);
+	INIT_LIST_HEAD(&kvm->ioregions_pio);
+}
+
+struct ioregion {
+	struct list_head     list;
+	u64                  paddr;  /* guest physical address */
+	u64                  size;   /* size in bytes */
+	struct file         *rf;
+	struct file         *wf;
+	u64                  user_data; /* opaque token used by userspace */
+	struct kvm_io_device dev;
+	bool                 posted_writes;
+};
+
+static inline struct ioregion *
+to_ioregion(struct kvm_io_device *dev)
+{
+	return container_of(dev, struct ioregion, dev);
+}
+
+/* assumes kvm->slots_lock held */
+static void
+ioregion_release(struct ioregion *p)
+{
+	fput(p->rf);
+	fput(p->wf);
+	list_del(&p->list);
+	kfree(p);
+}
+
+static int
+ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
+	      int len, void *val)
+{
+	return -EOPNOTSUPP;
+}
+
+static int
+ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
+		int len, const void *val)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * This function is called as KVM is completely shutting down.  We do not
+ * need to worry about locking just nuke anything we have as quickly as possible
+ */
+static void
+ioregion_destructor(struct kvm_io_device *this)
+{
+	struct ioregion *p = to_ioregion(this);
+
+	ioregion_release(p);
+}
+
+static const struct kvm_io_device_ops ioregion_ops = {
+	.read       = ioregion_read,
+	.write      = ioregion_write,
+	.destructor = ioregion_destructor,
+};
+
+static inline struct list_head *
+get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
+{
+	return (bus_idx == KVM_MMIO_BUS) ?
+		&kvm->ioregions_mmio : &kvm->ioregions_pio;
+}
+
+/* check for not overlapping case and reverse */
+inline bool
+overlap(u64 start1, u64 size1, u64 start2, u64 size2)
+{
+	u64 end1 = start1 + size1 - 1;
+	u64 end2 = start2 + size2 - 1;
+
+	return !(end1 < start2 || start1 >= end2);
+}
+
+/* assumes kvm->slots_lock held */
+bool
+kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
+		      u64 start, u64 size)
+{
+	struct ioregion *_p;
+	struct list_head *ioregions;
+
+	ioregions = get_ioregion_list(kvm, bus_idx);
+	list_for_each_entry(_p, ioregions, list)
+		if (overlap(start, size, _p->paddr, _p->size))
+			return true;
+
+	return false;
+}
+
+/* assumes kvm->slots_lock held */
+static bool
+ioregion_collision(struct kvm *kvm, struct ioregion *p, enum kvm_bus bus_idx)
+{
+	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
+	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
+		return true;
+
+	return false;
+}
+
+static enum kvm_bus
+get_bus_from_flags(__u32 flags)
+{
+	if (flags & KVM_IOREGION_PIO)
+		return KVM_PIO_BUS;
+	return KVM_MMIO_BUS;
+}
+
+int
+kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	struct ioregion *p;
+	struct file *rfile, *wfile;
+	enum kvm_bus bus_idx;
+	int ret = 0;
+
+	if (!args->memory_size)
+		return -EINVAL;
+	if ((args->guest_paddr + args->memory_size - 1) < args->guest_paddr)
+		return -EINVAL;
+
+	rfile = fget(args->rfd);
+	if (!rfile)
+		return -EBADF;
+	wfile = fget(args->wfd);
+	if (!wfile) {
+		fput(rfile);
+		return -EBADF;
+	}
+	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags & O_NONBLOCK)) {
+		ret = -EINVAL;
+		goto fail;
+	}
+	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
+	if (!p) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&p->list);
+	p->paddr = args->guest_paddr;
+	p->size = args->memory_size;
+	p->user_data = args->user_data;
+	p->rf = rfile;
+	p->wf = wfile;
+	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
+	bus_idx = get_bus_from_flags(args->flags);
+
+	mutex_lock(&kvm->slots_lock);
+
+	if (ioregion_collision(kvm, p, bus_idx)) {
+		ret = -EEXIST;
+		goto unlock_fail;
+	}
+	kvm_iodevice_init(&p->dev, &ioregion_ops);
+	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
+				      &p->dev);
+	if (ret < 0)
+		goto unlock_fail;
+	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return 0;
+
+unlock_fail:
+	mutex_unlock(&kvm->slots_lock);
+	kfree(p);
+fail:
+	fput(rfile);
+	fput(wfile);
+
+	return ret;
+}
+
+static int
+kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	struct ioregion         *p, *tmp;
+	enum kvm_bus             bus_idx;
+	int                      ret = -ENOENT;
+	struct list_head        *ioregions;
+
+	if (args->rfd != -1 || args->wfd != -1)
+		return -EINVAL;
+
+	bus_idx = get_bus_from_flags(args->flags);
+	ioregions = get_ioregion_list(kvm, bus_idx);
+
+	mutex_lock(&kvm->slots_lock);
+
+	list_for_each_entry_safe(p, tmp, ioregions, list) {
+		if (p->paddr == args->guest_paddr  &&
+		    p->size == args->memory_size) {
+			kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+			ioregion_release(p);
+			ret = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+
+int
+kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
+{
+	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
+		return -EINVAL;
+	if (args->rfd == -1 || args->wfd == -1)
+		return kvm_rm_ioregion(kvm, args);
+
+	return kvm_set_ioregion(kvm, args);
+}
diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
new file mode 100644
index 000000000000..23ffa812ec7a
--- /dev/null
+++ b/virt/kvm/ioregion.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_IOREGION_H__
+#define __KVM_IOREGION_H__
+
+#ifdef CONFIG_KVM_IOREGION
+inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
+bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
+#else
+static inline bool
+kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
+{
+	return false;
+}
+#endif
+#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17ff1c4..88b92fc3da51 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mmgrab(current->mm);
 	kvm->mm = current->mm;
 	kvm_eventfd_init(kvm);
+	kvm_ioregionfd_init(kvm);
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
@@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
 		break;
 	}
+	case KVM_SET_IOREGION: {
+		struct kvm_ioregion data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof(data)))
+			goto out;
+
+		r = kvm_ioregionfd(kvm, &data);
+		break;
+	}
 	case KVM_GET_DIRTY_LOG: {
 		struct kvm_dirty_log log;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
                   ` (3 preceding siblings ...)
  2021-01-29 18:48 ` [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
@ 2021-01-30 14:56 ` Stefan Hajnoczi
  2021-02-02 14:59 ` Stefan Hajnoczi
  2021-02-08  6:02 ` Jason Wang
  6 siblings, 0 replies; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-30 14:56 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 1227 bytes --]

On Thu, Jan 28, 2021 at 09:32:19PM +0300, Elena Afanasova wrote:
> This patchset introduces a KVM dispatch mechanism which can be used 
> for handling MMIO/PIO accesses over file descriptors without returning 
> from ioctl(KVM_RUN). This allows device emulation to run in another task 
> separate from the vCPU task.
> 
> This is achieved through KVM vm ioctl for registering MMIO/PIO regions and 
> a wire protocol that KVM uses to communicate with a task handling an 
> MMIO/PIO access.
> 
> TODOs:
> * Implement KVM_EXIT_IOREGIONFD_FAILURE
> * Add non-x86 arch support

This is an interesting topic for discussion with the KVM maintainers.
The ioctl(KVM_RUN) exit code is arch-specific in the sense that there is
no standard approach for PIO/MMIO accesses to return to userspace and
resume processing when ioctl(KVM_RUN) is called again.

This RFC series is x86-specific, but part of it can be made cross-arch
by introducing a core KVM ->complete_user_exit() function pointer that
MMIO/PIO and other users (EINTR?) can use to resume processing when
ioctl(KVM_RUN) is called again.

Maybe the benefit for a generic ->complete_user_exit() function is too
small so each arch open codes this behavior?

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-01-29 18:48 ` [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
@ 2021-01-30 15:04   ` Stefan Hajnoczi
  2021-02-04 13:03   ` Cornelia Huck
  2021-02-08  6:21   ` Jason Wang
  2 siblings, 0 replies; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-30 15:04 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 522 bytes --]

On Fri, Jan 29, 2021 at 09:48:26PM +0300, Elena Afanasova wrote:
> This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> read and write accesses are dispatched through the given ioregionfd
> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> setting fds to -1.
> 
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
> Changes in v2:
>   - changes after code review

Please try to be more specific in future revisions so reviewers know
exactly what to look out for.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
@ 2021-01-30 16:58   ` Stefan Hajnoczi
  2021-02-03 14:00     ` Elena Afanasova
  2021-02-09  6:21   ` Jason Wang
  2021-02-09  6:26   ` Jason Wang
  2 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-30 16:58 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 6275 bytes --]

On Thu, Jan 28, 2021 at 09:32:21PM +0300, Elena Afanasova wrote:
> The vCPU thread may receive a signal during ioregionfd communication,
> ioctl(KVM_RUN) needs to return to userspace and then ioctl(KVM_RUN)
> must resume ioregionfd.
> 
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
> Changes in v2:
>   - add support for x86 signal handling
>   - changes after code review
> 
>  arch/x86/kvm/x86.c            | 196 +++++++++++++++++++++++++++++++---
>  include/linux/kvm_host.h      |  13 +++
>  include/uapi/linux/ioregion.h |  32 ++++++
>  virt/kvm/ioregion.c           | 177 +++++++++++++++++++++++++++++-
>  virt/kvm/kvm_main.c           |  16 ++-
>  5 files changed, 415 insertions(+), 19 deletions(-)
>  create mode 100644 include/uapi/linux/ioregion.h
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ddb28f5ca252..a04516b531da 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>  {
>  	int handled = 0;
>  	int n;
> +	int ret = 0;
> +	bool is_apic;
>  
>  	do {
>  		n = min(len, 8);
> -		if (!(lapic_in_kernel(vcpu) &&
> -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
> -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
> -			break;
> +		is_apic = lapic_in_kernel(vcpu) &&
> +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> +					      addr, n, v);
> +		if (!is_apic) {
> +			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
> +					       addr, n, v);
> +			if (ret)
> +				break;
> +		}
>  		handled += n;
>  		addr += n;
>  		len -= n;
>  		v += n;
>  	} while (len);
>  
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;
> +	}
> +#endif
> +
>  	return handled;
>  }

There is a special case for crossing page boundaries:
1. ioregion in the first 4 bytes (page 1) but not the second 4 bytes (page 2).
2. ioregion in the second 4 bytes (page 2) but not the first 4 bytes (page 1).
3. The first 4 bytes (page 1) in one ioregion and the second 4 bytes (page 2) in another ioregion.
4. The first 4 bytes (page 1) in one ioregion and the second 4 bytes (page 2) in the same ioregion.

Cases 3 and 4 are tricky. If I'm reading the code correctly we try
ioregion accesses twice, even if the first one returns -EINTR?

> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7cd667dddba9..5cfdecfca6db 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -318,6 +318,19 @@ struct kvm_vcpu {
>  #endif
>  	bool preempted;
>  	bool ready;
> +#ifdef CONFIG_KVM_IOREGION
> +	bool ioregion_interrupted;

Can this field move into ioregion_ctx?

> +	struct {
> +		struct kvm_io_device *dev;
> +		int pio;
> +		void *val;
> +		u8 state;
> +		u64 addr;
> +		int len;
> +		u64 data;
> +		bool in;
> +	} ioregion_ctx;

This struct can be reordered to remove holes between fields.

> +#endif
>  	struct kvm_vcpu_arch arch;
>  };
>  
> diff --git a/include/uapi/linux/ioregion.h b/include/uapi/linux/ioregion.h
> new file mode 100644
> index 000000000000..7898c01f84a1
> --- /dev/null
> +++ b/include/uapi/linux/ioregion.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */

To encourage people to implement the wire protocol even beyond the Linux
syscall environment (e.g. in other hypervisors and VMMs) you could make
the license more permissive:

  /* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */

Several other <linux/*.h> files do this so that the header can be used
outside Linux without license concerns.

Here is the BSD 3-Clause license:
https://opensource.org/licenses/BSD-3-Clause

> +#ifndef _UAPI_LINUX_IOREGION_H
> +#define _UAPI_LINUX_IOREGION_H

Please add the wire protocol specification/documentation into this file.
That way this header file will serve as a comprehensive reference for
the protocol and changes to the header will also update the
documentation.

(The ioctl KVM_SET_IOREGIONFD parts belong in
Documentation/virt/kvm/api.rst but the wire protocol should be in this
header file instead.)

> +
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};
> +
> +struct ioregionfd_resp {
> +	__u64 data;
> +	__u8 pad[24];
> +};
> +
> +#define IOREGIONFD_CMD_READ    0
> +#define IOREGIONFD_CMD_WRITE   1
> +
> +#define IOREGIONFD_SIZE_8BIT   0
> +#define IOREGIONFD_SIZE_16BIT  1
> +#define IOREGIONFD_SIZE_32BIT  2
> +#define IOREGIONFD_SIZE_64BIT  3

It's possible that larger read/write operations will be needed in the
future. For example, the PCI Express bus supports much larger
transactions than just 64 bits.

You don't need to address this right now but I wanted to mention it.

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 88b92fc3da51..df387857f51f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>  			      struct kvm_io_range *range, const void *val)
>  {
>  	int idx;
> +	int ret = 0;
>  
>  	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
>  	if (idx < 0)
> @@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>  
>  	while (idx < bus->dev_count &&
>  		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> -		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> -					range->len, val))
> +		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> +					 range->len, val);
> +		if (!ret)
>  			return idx;
> +		if (ret < 0 && ret != -EOPNOTSUPP)
> +			return ret;

I audited all kvm_io_bus_read/write() callers to check that it's safe to
add error return values besides -EOPNOTSUPP. Extending the meaning of
the return value is fine but any arches that want to support ioregionfd
need to explicitly handle -EINTR return values now. Only x86 does after
this patch.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization
  2021-01-28 18:32 ` [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization Elena Afanasova
@ 2021-01-30 18:54   ` Stefan Hajnoczi
  2021-02-03 14:10     ` Elena Afanasova
  0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-01-30 18:54 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 5384 bytes --]

On Thu, Jan 28, 2021 at 09:32:22PM +0300, Elena Afanasova wrote:
> Add ioregionfd context and kvm_io_device_ops->prepare/finish()
> in order to serialize all bytes requested by guest.
> 
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
>  arch/x86/kvm/x86.c       |  19 ++++++++
>  include/kvm/iodev.h      |  14 ++++++
>  include/linux/kvm_host.h |   4 ++
>  virt/kvm/ioregion.c      | 102 +++++++++++++++++++++++++++++++++------
>  virt/kvm/kvm_main.c      |  32 ++++++++++++
>  5 files changed, 157 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a04516b531da..393fb0f4bf46 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5802,6 +5802,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>  	int ret = 0;
>  	bool is_apic;
>  
> +	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
> +
>  	do {
>  		n = min(len, 8);
>  		is_apic = lapic_in_kernel(vcpu) &&
> @@ -5823,8 +5825,10 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>  	if (ret == -EINTR) {
>  		vcpu->run->exit_reason = KVM_EXIT_INTR;
>  		++vcpu->stat.signal_exits;
> +		return handled;
>  	}
>  #endif
> +	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);

Hmm...it would be nice for kvm_io_bus_prepare() to return the idx or the
device pointer so the devices don't need to be searched in
read/write/finish. However, it's complicated by the loop which may
access multiple devices.

> @@ -9309,6 +9325,7 @@ static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
>  		vcpu->mmio_cur_fragment++;
>  	}
>  
> +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
>  	vcpu->mmio_needed = 0;
>  	if (!vcpu->ioregion_ctx.in) {
>  		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> @@ -9333,6 +9350,7 @@ static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
>  		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
>  	}
>  
> +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
>  	if (vcpu->ioregion_ctx.in)
>  		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> @@ -9352,6 +9370,7 @@ static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
>  	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
>  				 vcpu->ioregion_ctx.len,
>  				 vcpu->ioregion_ctx.val);
> +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
>  
>  	if (vcpu->ioregion_ctx.in) {

Normally userspace will invoke ioctl(KVM_RUN) and reach one of these
completion functions, but what if the vcpu fd is closed instead?
->finish() should still be called to avoid leaks.

> diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
> index d75fc4365746..db8a3c69b7bb 100644
> --- a/include/kvm/iodev.h
> +++ b/include/kvm/iodev.h
> @@ -25,6 +25,8 @@ struct kvm_io_device_ops {
>  		     gpa_t addr,
>  		     int len,
>  		     const void *val);
> +	void (*prepare)(struct kvm_io_device *this);
> +	void (*finish)(struct kvm_io_device *this);
>  	void (*destructor)(struct kvm_io_device *this);
>  };
>  
> @@ -55,6 +57,18 @@ static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
>  				 : -EOPNOTSUPP;
>  }
>  
> +static inline void kvm_iodevice_prepare(struct kvm_io_device *dev)
> +{
> +	if (dev->ops->prepare)
> +		dev->ops->prepare(dev);
> +}
> +
> +static inline void kvm_iodevice_finish(struct kvm_io_device *dev)
> +{
> +	if (dev->ops->finish)
> +		dev->ops->finish(dev);
> +}

A performance optimization: keep a separate list of struct
kvm_io_devices that implement prepare/finish. That way the search
doesn't need to iterate over devices that don't support this interface.

Before implementing an optimization like this it would be good to check
how this patch affects performance on guests with many in-kernel devices
(e.g. a guest that has many multi-queue virtio-net/blk devices with
ioeventfd). ioregionfd shouldn't reduce performance of existing KVM
configurations, so it's worth measuring.

> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index da38124e1418..3474090ccc8c 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -1,6 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  #include <linux/kvm_host.h>
> -#include <linux/fs.h>
> +#include <linux/wait.h>
>  #include <kvm/iodev.h>
>  #include "eventfd.h"
>  #include <uapi/linux/ioregion.h>
> @@ -12,15 +12,23 @@ kvm_ioregionfd_init(struct kvm *kvm)
>  	INIT_LIST_HEAD(&kvm->ioregions_pio);
>  }
>  
> +/* Serializes ioregionfd cmds/replies */

Please expand on this comment:

  ioregions that share the same rfd are serialized so that only one vCPU
  thread sends a struct ioregionfd_cmd to userspace at a time. This
  ensures that the struct ioregionfd_resp received from userspace will
  be processed by the one and only vCPU thread that sent it.

  A waitqueue is used to wake up waiting vCPU threads in order. Most of
  the time the waitqueue is unused and the lock is not contended.
  For best performance userspace should set up ioregionfds so that there
  is no contention (e.g. dedicated ioregionfds for queue doorbell
  registers on multi-queue devices).

A comment along these lines will give readers an idea of why the code
does this.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
                   ` (4 preceding siblings ...)
  2021-01-30 14:56 ` [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
@ 2021-02-02 14:59 ` Stefan Hajnoczi
  2021-02-08  6:02 ` Jason Wang
  6 siblings, 0 replies; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-02-02 14:59 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: kvm, jag.raman, elena.ufimtseva, Michael S. Tsirkin, jasowang,
	John Levon, Elena Afanasova

[-- Attachment #1: Type: text/plain, Size: 1984 bytes --]

On Thu, Jan 28, 2021 at 09:32:19PM +0300, Elena Afanasova wrote:
> This patchset introduces a KVM dispatch mechanism which can be used 
> for handling MMIO/PIO accesses over file descriptors without returning 
> from ioctl(KVM_RUN). This allows device emulation to run in another task 
> separate from the vCPU task.
> 
> This is achieved through KVM vm ioctl for registering MMIO/PIO regions and 
> a wire protocol that KVM uses to communicate with a task handling an 
> MMIO/PIO access.

Hi Paolo,
This patch series makes changes to the somewhat tricky x86 MMIO/PIO code
and introduces a new ioctl(KVM_RUN) EINTR case. Please take a look if
you have time!

Reviews from anyone else are appreciated too!

Thanks,
Stefan

> 
> TODOs:
> * Implement KVM_EXIT_IOREGIONFD_FAILURE
> * Add non-x86 arch support
> * Add kvm-unittests
> 
> Elena Afanasova (4):
>   KVM: add initial support for KVM_SET_IOREGION
>   KVM: x86: add support for ioregionfd signal handling
>   KVM: add support for ioregionfd cmds/replies serialization
>   KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled
> 
>  arch/x86/kvm/Kconfig          |   1 +
>  arch/x86/kvm/Makefile         |   1 +
>  arch/x86/kvm/x86.c            | 216 ++++++++++++++-
>  include/kvm/iodev.h           |  14 +
>  include/linux/kvm_host.h      |  34 +++
>  include/uapi/linux/ioregion.h |  32 +++
>  include/uapi/linux/kvm.h      |  23 ++
>  virt/kvm/Kconfig              |   3 +
>  virt/kvm/eventfd.c            |  25 ++
>  virt/kvm/eventfd.h            |  14 +
>  virt/kvm/ioregion.c           | 479 ++++++++++++++++++++++++++++++++++
>  virt/kvm/ioregion.h           |  15 ++
>  virt/kvm/kvm_main.c           |  68 ++++-
>  13 files changed, 905 insertions(+), 20 deletions(-)
>  create mode 100644 include/uapi/linux/ioregion.h
>  create mode 100644 virt/kvm/eventfd.h
>  create mode 100644 virt/kvm/ioregion.c
>  create mode 100644 virt/kvm/ioregion.h
> 
> -- 
> 2.25.1
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-01-30 16:58   ` Stefan Hajnoczi
@ 2021-02-03 14:00     ` Elena Afanasova
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-02-03 14:00 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kvm, jag.raman, elena.ufimtseva

On Sat, 2021-01-30 at 16:58 +0000, Stefan Hajnoczi wrote:
> On Thu, Jan 28, 2021 at 09:32:21PM +0300, Elena Afanasova wrote:
> > The vCPU thread may receive a signal during ioregionfd
> > communication,
> > ioctl(KVM_RUN) needs to return to userspace and then ioctl(KVM_RUN)
> > must resume ioregionfd.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> > Changes in v2:
> >   - add support for x86 signal handling
> >   - changes after code review
> > 
> >  arch/x86/kvm/x86.c            | 196
> > +++++++++++++++++++++++++++++++---
> >  include/linux/kvm_host.h      |  13 +++
> >  include/uapi/linux/ioregion.h |  32 ++++++
> >  virt/kvm/ioregion.c           | 177 +++++++++++++++++++++++++++++-
> >  virt/kvm/kvm_main.c           |  16 ++-
> >  5 files changed, 415 insertions(+), 19 deletions(-)
> >  create mode 100644 include/uapi/linux/ioregion.h
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index ddb28f5ca252..a04516b531da 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >  {
> >  	int handled = 0;
> >  	int n;
> > +	int ret = 0;
> > +	bool is_apic;
> >  
> >  	do {
> >  		n = min(len, 8);
> > -		if (!(lapic_in_kernel(vcpu) &&
> > -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> > addr, n, v))
> > -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n,
> > v))
> > -			break;
> > +		is_apic = lapic_in_kernel(vcpu) &&
> > +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic-
> > >dev,
> > +					      addr, n, v);
> > +		if (!is_apic) {
> > +			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
> > +					       addr, n, v);
> > +			if (ret)
> > +				break;
> > +		}
> >  		handled += n;
> >  		addr += n;
> >  		len -= n;
> >  		v += n;
> >  	} while (len);
> >  
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +	}
> > +#endif
> > +
> >  	return handled;
> >  }
> 
> There is a special case for crossing page boundaries:
> 1. ioregion in the first 4 bytes (page 1) but not the second 4 bytes
> (page 2).
> 2. ioregion in the second 4 bytes (page 2) but not the first 4 bytes
> (page 1).
> 3. The first 4 bytes (page 1) in one ioregion and the second 4 bytes
> (page 2) in another ioregion.
> 4. The first 4 bytes (page 1) in one ioregion and the second 4 bytes
> (page 2) in the same ioregion.
> 
> Cases 3 and 4 are tricky. If I'm reading the code correctly we try
> ioregion accesses twice, even if the first one returns -EINTR?
> 
Yes, in the case of crossing a page boundary
emulator_read_write_onepage() will be called twice. This case isn’t
supported in the current code. Also I think that synchronization code
for vcpu_mmio_write/read() is wrong. Probably
kvm_io_bus_prepare/finish() should be called for every
kvm_io_bus_read/write(). I'll try to fix that.

> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7cd667dddba9..5cfdecfca6db 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -318,6 +318,19 @@ struct kvm_vcpu {
> >  #endif
> >  	bool preempted;
> >  	bool ready;
> > +#ifdef CONFIG_KVM_IOREGION
> > +	bool ioregion_interrupted;
> 
> Can this field move into ioregion_ctx?
> 
Yes

> > +	struct {
> > +		struct kvm_io_device *dev;
> > +		int pio;
> > +		void *val;
> > +		u8 state;
> > +		u64 addr;
> > +		int len;
> > +		u64 data;
> > +		bool in;
> > +	} ioregion_ctx;
> 
> This struct can be reordered to remove holes between fields.
> 
Ok, will do

> > +#endif
> >  	struct kvm_vcpu_arch arch;
> >  };
> >  
> > diff --git a/include/uapi/linux/ioregion.h
> > b/include/uapi/linux/ioregion.h
> > new file mode 100644
> > index 000000000000..7898c01f84a1
> > --- /dev/null
> > +++ b/include/uapi/linux/ioregion.h
> > @@ -0,0 +1,32 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> 
> To encourage people to implement the wire protocol even beyond the
> Linux
> syscall environment (e.g. in other hypervisors and VMMs) you could
> make
> the license more permissive:
> 
>   /* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) 
> OR BSD-3-Clause) */
> 
> Several other <linux/*.h> files do this so that the header can be
> used
> outside Linux without license concerns.
> 
> Here is the BSD 3-Clause license:
> https://opensource.org/licenses/BSD-3-Clause
> 
> > +#ifndef _UAPI_LINUX_IOREGION_H
> > +#define _UAPI_LINUX_IOREGION_H
> 
> Please add the wire protocol specification/documentation into this
> file.
> That way this header file will serve as a comprehensive reference for
> the protocol and changes to the header will also update the
> documentation.
> 
> (The ioctl KVM_SET_IOREGIONFD parts belong in
> Documentation/virt/kvm/api.rst but the wire protocol should be in
> this
> header file instead.)
> 
Ok

> > +
> > +/* Wire protocol */
> > +struct ioregionfd_cmd {
> > +	__u32 info;
> > +	__u32 padding;
> > +	__u64 user_data;
> > +	__u64 offset;
> > +	__u64 data;
> > +};
> > +
> > +struct ioregionfd_resp {
> > +	__u64 data;
> > +	__u8 pad[24];
> > +};
> > +
> > +#define IOREGIONFD_CMD_READ    0
> > +#define IOREGIONFD_CMD_WRITE   1
> > +
> > +#define IOREGIONFD_SIZE_8BIT   0
> > +#define IOREGIONFD_SIZE_16BIT  1
> > +#define IOREGIONFD_SIZE_32BIT  2
> > +#define IOREGIONFD_SIZE_64BIT  3
> 
> It's possible that larger read/write operations will be needed in the
> future. For example, the PCI Express bus supports much larger
> transactions than just 64 bits.
> 
> You don't need to address this right now but I wanted to mention it.
> 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 88b92fc3da51..df387857f51f 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >  			      struct kvm_io_range *range, const void
> > *val)
> >  {
> >  	int idx;
> > +	int ret = 0;
> >  
> >  	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
> >  	if (idx < 0)
> > @@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct
> > kvm_vcpu *vcpu, struct kvm_io_bus *bus,
> >  
> >  	while (idx < bus->dev_count &&
> >  		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> > -		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev,
> > range->addr,
> > -					range->len, val))
> > +		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev,
> > range->addr,
> > +					 range->len, val);
> > +		if (!ret)
> >  			return idx;
> > +		if (ret < 0 && ret != -EOPNOTSUPP)
> > +			return ret;
> 
> I audited all kvm_io_bus_read/write() callers to check that it's safe
> to
> add error return values besides -EOPNOTSUPP. Extending the meaning of
> the return value is fine but any arches that want to support
> ioregionfd
> need to explicitly handle -EINTR return values now. Only x86 does
> after
> this patch.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization
  2021-01-30 18:54   ` Stefan Hajnoczi
@ 2021-02-03 14:10     ` Elena Afanasova
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-02-03 14:10 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kvm, jag.raman, elena.ufimtseva

On Sat, 2021-01-30 at 18:54 +0000, Stefan Hajnoczi wrote:
> On Thu, Jan 28, 2021 at 09:32:22PM +0300, Elena Afanasova wrote:
> > Add ioregionfd context and kvm_io_device_ops->prepare/finish()
> > in order to serialize all bytes requested by guest.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> >  arch/x86/kvm/x86.c       |  19 ++++++++
> >  include/kvm/iodev.h      |  14 ++++++
> >  include/linux/kvm_host.h |   4 ++
> >  virt/kvm/ioregion.c      | 102 +++++++++++++++++++++++++++++++++
> > ------
> >  virt/kvm/kvm_main.c      |  32 ++++++++++++
> >  5 files changed, 157 insertions(+), 14 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index a04516b531da..393fb0f4bf46 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5802,6 +5802,8 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >  	int ret = 0;
> >  	bool is_apic;
> >  
> > +	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
> > +
> >  	do {
> >  		n = min(len, 8);
> >  		is_apic = lapic_in_kernel(vcpu) &&
> > @@ -5823,8 +5825,10 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >  	if (ret == -EINTR) {
> >  		vcpu->run->exit_reason = KVM_EXIT_INTR;
> >  		++vcpu->stat.signal_exits;
> > +		return handled;
> >  	}
> >  #endif
> > +	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);
> 
> Hmm...it would be nice for kvm_io_bus_prepare() to return the idx or
> the
> device pointer so the devices don't need to be searched in
> read/write/finish. However, it's complicated by the loop which may
> access multiple devices.
> 
Agree

> > @@ -9309,6 +9325,7 @@ static int complete_ioregion_mmio(struct
> > kvm_vcpu *vcpu)
> >  		vcpu->mmio_cur_fragment++;
> >  	}
> >  
> > +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
> >  	vcpu->mmio_needed = 0;
> >  	if (!vcpu->ioregion_ctx.in) {
> >  		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > @@ -9333,6 +9350,7 @@ static int complete_ioregion_pio(struct
> > kvm_vcpu *vcpu)
> >  		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
> >  	}
> >  
> > +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
> >  	if (vcpu->ioregion_ctx.in)
> >  		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > @@ -9352,6 +9370,7 @@ static int complete_ioregion_fast_pio(struct
> > kvm_vcpu *vcpu)
> >  	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> >  				 vcpu->ioregion_ctx.len,
> >  				 vcpu->ioregion_ctx.val);
> > +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> >  
> >  	if (vcpu->ioregion_ctx.in) {
> 
> Normally userspace will invoke ioctl(KVM_RUN) and reach one of these
> completion functions, but what if the vcpu fd is closed instead?
> ->finish() should still be called to avoid leaks.
> 
Will fix

> > diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
> > index d75fc4365746..db8a3c69b7bb 100644
> > --- a/include/kvm/iodev.h
> > +++ b/include/kvm/iodev.h
> > @@ -25,6 +25,8 @@ struct kvm_io_device_ops {
> >  		     gpa_t addr,
> >  		     int len,
> >  		     const void *val);
> > +	void (*prepare)(struct kvm_io_device *this);
> > +	void (*finish)(struct kvm_io_device *this);
> >  	void (*destructor)(struct kvm_io_device *this);
> >  };
> >  
> > @@ -55,6 +57,18 @@ static inline int kvm_iodevice_write(struct
> > kvm_vcpu *vcpu,
> >  				 : -EOPNOTSUPP;
> >  }
> >  
> > +static inline void kvm_iodevice_prepare(struct kvm_io_device *dev)
> > +{
> > +	if (dev->ops->prepare)
> > +		dev->ops->prepare(dev);
> > +}
> > +
> > +static inline void kvm_iodevice_finish(struct kvm_io_device *dev)
> > +{
> > +	if (dev->ops->finish)
> > +		dev->ops->finish(dev);
> > +}
> 
> A performance optimization: keep a separate list of struct
> kvm_io_devices that implement prepare/finish. That way the search
> doesn't need to iterate over devices that don't support this
> interface.
> 
Thanks for the idea

> Before implementing an optimization like this it would be good to
> check
> how this patch affects performance on guests with many in-kernel
> devices
> (e.g. a guest that has many multi-queue virtio-net/blk devices with
> ioeventfd). ioregionfd shouldn't reduce performance of existing KVM
> configurations, so it's worth measuring.
> 
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > index da38124e1418..3474090ccc8c 100644
> > --- a/virt/kvm/ioregion.c
> > +++ b/virt/kvm/ioregion.c
> > @@ -1,6 +1,6 @@
> >  // SPDX-License-Identifier: GPL-2.0-only
> >  #include <linux/kvm_host.h>
> > -#include <linux/fs.h>
> > +#include <linux/wait.h>
> >  #include <kvm/iodev.h>
> >  #include "eventfd.h"
> >  #include <uapi/linux/ioregion.h>
> > @@ -12,15 +12,23 @@ kvm_ioregionfd_init(struct kvm *kvm)
> >  	INIT_LIST_HEAD(&kvm->ioregions_pio);
> >  }
> >  
> > +/* Serializes ioregionfd cmds/replies */
> 
> Please expand on this comment:
> 
>   ioregions that share the same rfd are serialized so that only one
> vCPU
>   thread sends a struct ioregionfd_cmd to userspace at a time. This
>   ensures that the struct ioregionfd_resp received from userspace
> will
>   be processed by the one and only vCPU thread that sent it.
> 
>   A waitqueue is used to wake up waiting vCPU threads in order. Most
> of
>   the time the waitqueue is unused and the lock is not contended.
>   For best performance userspace should set up ioregionfds so that
> there
>   is no contention (e.g. dedicated ioregionfds for queue doorbell
>   registers on multi-queue devices).
> 
> A comment along these lines will give readers an idea of why the code
> does this.

Ok, thank you


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-01-29 18:48 ` [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
  2021-01-30 15:04   ` Stefan Hajnoczi
@ 2021-02-04 13:03   ` Cornelia Huck
  2021-02-05 18:39     ` Elena Afanasova
  2021-02-08  6:21   ` Jason Wang
  2 siblings, 1 reply; 28+ messages in thread
From: Cornelia Huck @ 2021-02-04 13:03 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, stefanha, jag.raman, elena.ufimtseva

On Fri, 29 Jan 2021 21:48:26 +0300
Elena Afanasova <eafanasova@gmail.com> wrote:

[Note: I've just started looking at this, please excuse any questions
that have already been answered elsewhere.]

> This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> read and write accesses are dispatched through the given ioregionfd
> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> setting fds to -1.
> 
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
> Changes in v2:
>   - changes after code review
> 
>  arch/x86/kvm/Kconfig     |   1 +
>  arch/x86/kvm/Makefile    |   1 +
>  arch/x86/kvm/x86.c       |   1 +
>  include/linux/kvm_host.h |  17 +++
>  include/uapi/linux/kvm.h |  23 ++++
>  virt/kvm/Kconfig         |   3 +
>  virt/kvm/eventfd.c       |  25 +++++
>  virt/kvm/eventfd.h       |  14 +++
>  virt/kvm/ioregion.c      | 232 +++++++++++++++++++++++++++++++++++++++
>  virt/kvm/ioregion.h      |  15 +++
>  virt/kvm/kvm_main.c      |  11 ++
>  11 files changed, 343 insertions(+)
>  create mode 100644 virt/kvm/eventfd.h
>  create mode 100644 virt/kvm/ioregion.c
>  create mode 100644 virt/kvm/ioregion.h

(...)

> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ca41220b40b8..81e775778c66 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>  	__u8  pad[36];
>  };
>  
> +enum {
> +	kvm_ioregion_flag_nr_pio,
> +	kvm_ioregion_flag_nr_posted_writes,
> +	kvm_ioregion_flag_nr_max,
> +};
> +
> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> +#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
> +
> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
> +
> +struct kvm_ioregion {
> +	__u64 guest_paddr; /* guest physical address */
> +	__u64 memory_size; /* bytes */
> +	__u64 user_data;
> +	__s32 rfd;
> +	__s32 wfd;

I guess these are read and write file descriptors? Maybe call them
read_fd and write_fd?

> +	__u32 flags;
> +	__u8  pad[28];
> +};
> +
>  #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>  #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>  #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_X86_USER_SPACE_MSR 188
>  #define KVM_CAP_X86_MSR_FILTER 189
>  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> +#define KVM_CAP_IOREGIONFD 191
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>  					struct kvm_userspace_memory_region)
>  #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct kvm_ioregion)

This new ioctl needs some documentation under
Documentation/virt/kvm/api.rst. (That would also make review easier.)

>  
>  /* enable ucontrol for s390 */
>  struct kvm_s390_ucas_mapping {

(...)

> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index c2323c27a28b..aadb73903f8b 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -27,6 +27,7 @@
>  #include <trace/events/kvm.h>
>  
>  #include <kvm/iodev.h>
> +#include "ioregion.h"
>  
>  #ifdef CONFIG_HAVE_KVM_IRQFD
>  
> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops ioeventfd_ops = {
>  	.destructor = ioeventfd_destructor,
>  };
>  
> +#ifdef CONFIG_KVM_IOREGION
> +/* assumes kvm->slots_lock held */
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> +			  u64 start, u64 size)
> +{
> +	struct _ioeventfd *_p;
> +
> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> +		if (_p->bus_idx == bus_idx &&
> +		    overlap(start, size, _p->addr,
> +			    !_p->length ? 8 : _p->length))

Not a problem right now, as this is x86 only, but I'm not sure we can
define "overlap" in a meaningful way for every bus_idx. (For example,
the s390-only ccw notifications use addr to identify a device; as long
as addr is unique, there will be no clash. I'm not sure yet if
ioregions are usable for ccw devices, and if yes, in which form, but we
should probably keep it in mind.)

> +			return true;
> +
> +	return false;
> +}
> +#endif
> +
>  /* assumes kvm->slots_lock held */
>  static bool
>  ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>  		       _p->datamatch == p->datamatch))))
>  			return true;
>  
> +#ifdef CONFIG_KVM_IOREGION
> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> +					  !p->length ? 8 : p->length))

What about KVM_FAST_MMIO_BUS?

> +			return true;
> +#endif
> +
>  	return false;
>  }
>  

(...)

> +/* check for not overlapping case and reverse */
> +inline bool
> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> +{
> +	u64 end1 = start1 + size1 - 1;
> +	u64 end2 = start2 + size2 - 1;
> +
> +	return !(end1 < start2 || start1 >= end2);
> +}

I'm wondering whether there's already a generic function to do a check
like this?

(...)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-04 13:03   ` Cornelia Huck
@ 2021-02-05 18:39     ` Elena Afanasova
  2021-02-08 11:49       ` Cornelia Huck
  0 siblings, 1 reply; 28+ messages in thread
From: Elena Afanasova @ 2021-02-05 18:39 UTC (permalink / raw)
  To: Cornelia Huck; +Cc: kvm, stefanha, jag.raman, elena.ufimtseva

On Thu, 2021-02-04 at 14:03 +0100, Cornelia Huck wrote:
> On Fri, 29 Jan 2021 21:48:26 +0300
> Elena Afanasova <eafanasova@gmail.com> wrote:
> 
> [Note: I've just started looking at this, please excuse any questions
> that have already been answered elsewhere.]
> 
> > This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> > read and write accesses are dispatched through the given ioregionfd
> > instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> > setting fds to -1.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> > Changes in v2:
> >   - changes after code review
> > 
> >  arch/x86/kvm/Kconfig     |   1 +
> >  arch/x86/kvm/Makefile    |   1 +
> >  arch/x86/kvm/x86.c       |   1 +
> >  include/linux/kvm_host.h |  17 +++
> >  include/uapi/linux/kvm.h |  23 ++++
> >  virt/kvm/Kconfig         |   3 +
> >  virt/kvm/eventfd.c       |  25 +++++
> >  virt/kvm/eventfd.h       |  14 +++
> >  virt/kvm/ioregion.c      | 232
> > +++++++++++++++++++++++++++++++++++++++
> >  virt/kvm/ioregion.h      |  15 +++
> >  virt/kvm/kvm_main.c      |  11 ++
> >  11 files changed, 343 insertions(+)
> >  create mode 100644 virt/kvm/eventfd.h
> >  create mode 100644 virt/kvm/ioregion.c
> >  create mode 100644 virt/kvm/ioregion.h
> 
> (...)
> 
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index ca41220b40b8..81e775778c66 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> >  	__u8  pad[36];
> >  };
> >  
> > +enum {
> > +	kvm_ioregion_flag_nr_pio,
> > +	kvm_ioregion_flag_nr_posted_writes,
> > +	kvm_ioregion_flag_nr_max,
> > +};
> > +
> > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > kvm_ioregion_flag_nr_posted_writes)
> > +
> > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > kvm_ioregion_flag_nr_max) - 1)
> > +
> > +struct kvm_ioregion {
> > +	__u64 guest_paddr; /* guest physical address */
> > +	__u64 memory_size; /* bytes */
> > +	__u64 user_data;
> > +	__s32 rfd;
> > +	__s32 wfd;
> 
> I guess these are read and write file descriptors? 
Yes

> Maybe call them read_fd and write_fd?
> 
Ok

> > +	__u32 flags;
> > +	__u8  pad[28];
> > +};
> > +
> >  #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> >  #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> >  #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> >  #define KVM_CAP_X86_USER_SPACE_MSR 188
> >  #define KVM_CAP_X86_MSR_FILTER 189
> >  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > +#define KVM_CAP_IOREGIONFD 191
> >  
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >  
> > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> >  					struct
> > kvm_userspace_memory_region)
> >  #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> >  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > kvm_ioregion)
> 
> This new ioctl needs some documentation under
> Documentation/virt/kvm/api.rst. (That would also make review easier.)
> 
Agreed. The latest version of the ioregionfd API can be found in 
https://marc.info/?l=kvm&m=160633710708172&w=2. There are still some
open questions, like write coalescing support. So I think the API may
still change during code reviews.

> >  
> >  /* enable ucontrol for s390 */
> >  struct kvm_s390_ucas_mapping {
> 
> (...)
> 
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index c2323c27a28b..aadb73903f8b 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -27,6 +27,7 @@
> >  #include <trace/events/kvm.h>
> >  
> >  #include <kvm/iodev.h>
> > +#include "ioregion.h"
> >  
> >  #ifdef CONFIG_HAVE_KVM_IRQFD
> >  
> > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > ioeventfd_ops = {
> >  	.destructor = ioeventfd_destructor,
> >  };
> >  
> > +#ifdef CONFIG_KVM_IOREGION
> > +/* assumes kvm->slots_lock held */
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > +			  u64 start, u64 size)
> > +{
> > +	struct _ioeventfd *_p;
> > +
> > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > +		if (_p->bus_idx == bus_idx &&
> > +		    overlap(start, size, _p->addr,
> > +			    !_p->length ? 8 : _p->length))
> 
> Not a problem right now, as this is x86 only, but I'm not sure we can
> define "overlap" in a meaningful way for every bus_idx. (For example,
> the s390-only ccw notifications use addr to identify a device; as
> long
> as addr is unique, there will be no clash. I'm not sure yet if
> ioregions are usable for ccw devices, and if yes, in which form, but
> we
> should probably keep it in mind.)
> 
Thank you for pointing it out. Yes, CCW bus seems to be a special case.

> > +			return true;
> > +
> > +	return false;
> > +}
> > +#endif
> > +
> >  /* assumes kvm->slots_lock held */
> >  static bool
> >  ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > struct _ioeventfd *p)
> >  		       _p->datamatch == p->datamatch))))
> >  			return true;
> >  
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> > +					  !p->length ? 8 : p->length))
> 
> What about KVM_FAST_MMIO_BUS?
> 
Yes, we have already discussed FAST_MMIO support with Jason Wang. TODO.

> > +			return true;
> > +#endif
> > +
> >  	return false;
> >  }
> >  
> 
> (...)
> 
> > +/* check for not overlapping case and reverse */
> > +inline bool
> > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > +{
> > +	u64 end1 = start1 + size1 - 1;
> > +	u64 end2 = start2 + size2 - 1;
> > +
> > +	return !(end1 < start2 || start1 >= end2);
> > +}
> 
> I'm wondering whether there's already a generic function to do a
> check
> like this?
> 
I couldn't find it.

> (...)
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd)
  2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
                   ` (5 preceding siblings ...)
  2021-02-02 14:59 ` Stefan Hajnoczi
@ 2021-02-08  6:02 ` Jason Wang
  6 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-08  6:02 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/29 上午2:32, Elena Afanasova wrote:
> This patchset introduces a KVM dispatch mechanism which can be used
> for handling MMIO/PIO accesses over file descriptors without returning
> from ioctl(KVM_RUN). This allows device emulation to run in another task
> separate from the vCPU task.
>
> This is achieved through KVM vm ioctl for registering MMIO/PIO regions and
> a wire protocol that KVM uses to communicate with a task handling an
> MMIO/PIO access.
>
> TODOs:
> * Implement KVM_EXIT_IOREGIONFD_FAILURE
> * Add non-x86 arch support
> * Add kvm-unittests


It would be better to log the changes between versions to ease the 
reviewers' work.

Thanks


>
> Elena Afanasova (4):
>    KVM: add initial support for KVM_SET_IOREGION
>    KVM: x86: add support for ioregionfd signal handling
>    KVM: add support for ioregionfd cmds/replies serialization
>    KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled
>
>   arch/x86/kvm/Kconfig          |   1 +
>   arch/x86/kvm/Makefile         |   1 +
>   arch/x86/kvm/x86.c            | 216 ++++++++++++++-
>   include/kvm/iodev.h           |  14 +
>   include/linux/kvm_host.h      |  34 +++
>   include/uapi/linux/ioregion.h |  32 +++
>   include/uapi/linux/kvm.h      |  23 ++
>   virt/kvm/Kconfig              |   3 +
>   virt/kvm/eventfd.c            |  25 ++
>   virt/kvm/eventfd.h            |  14 +
>   virt/kvm/ioregion.c           | 479 ++++++++++++++++++++++++++++++++++
>   virt/kvm/ioregion.h           |  15 ++
>   virt/kvm/kvm_main.c           |  68 ++++-
>   13 files changed, 905 insertions(+), 20 deletions(-)
>   create mode 100644 include/uapi/linux/ioregion.h
>   create mode 100644 virt/kvm/eventfd.h
>   create mode 100644 virt/kvm/ioregion.c
>   create mode 100644 virt/kvm/ioregion.h
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-01-29 18:48 ` [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
  2021-01-30 15:04   ` Stefan Hajnoczi
  2021-02-04 13:03   ` Cornelia Huck
@ 2021-02-08  6:21   ` Jason Wang
  2021-02-09 14:59     ` Stefan Hajnoczi
  2021-02-10 19:31     ` Elena Afanasova
  2 siblings, 2 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-08  6:21 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/30 上午2:48, Elena Afanasova wrote:
> This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> read and write accesses are dispatched through the given ioregionfd
> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> setting fds to -1.
>
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
> Changes in v2:
>    - changes after code review
>
>   arch/x86/kvm/Kconfig     |   1 +
>   arch/x86/kvm/Makefile    |   1 +
>   arch/x86/kvm/x86.c       |   1 +
>   include/linux/kvm_host.h |  17 +++
>   include/uapi/linux/kvm.h |  23 ++++
>   virt/kvm/Kconfig         |   3 +
>   virt/kvm/eventfd.c       |  25 +++++
>   virt/kvm/eventfd.h       |  14 +++
>   virt/kvm/ioregion.c      | 232 +++++++++++++++++++++++++++++++++++++++
>   virt/kvm/ioregion.h      |  15 +++
>   virt/kvm/kvm_main.c      |  11 ++
>   11 files changed, 343 insertions(+)
>   create mode 100644 virt/kvm/eventfd.h
>   create mode 100644 virt/kvm/ioregion.c
>   create mode 100644 virt/kvm/ioregion.h
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index f92dfd8ef10d..b914ef375199 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -33,6 +33,7 @@ config KVM
>   	select HAVE_KVM_IRQ_BYPASS
>   	select HAVE_KVM_IRQ_ROUTING
>   	select HAVE_KVM_EVENTFD
> +	select KVM_IOREGION
>   	select KVM_ASYNC_PF
>   	select USER_RETURN_NOTIFIER
>   	select KVM_MMIO
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index b804444e16d4..b3b17dc9f7d4 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>   kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
>   				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
>   kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>   
>   kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
>   			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e545a8a613b1..ddb28f5ca252 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   	case KVM_CAP_X86_USER_SPACE_MSR:
>   	case KVM_CAP_X86_MSR_FILTER:
>   	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> +	case KVM_CAP_IOREGIONFD:
>   		r = 1;
>   		break;
>   	case KVM_CAP_SYNC_REGS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7f2e2a09ebbd..7cd667dddba9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -470,6 +470,10 @@ struct kvm {
>   		struct mutex      resampler_lock;
>   	} irqfds;
>   	struct list_head ioeventfds;
> +#endif
> +#ifdef CONFIG_KVM_IOREGION
> +	struct list_head ioregions_mmio;
> +	struct list_head ioregions_pio;
>   #endif
>   	struct kvm_vm_stat stat;
>   	struct kvm_arch arch;
> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   
>   #endif /* CONFIG_HAVE_KVM_EVENTFD */
>   
> +#ifdef CONFIG_KVM_IOREGION
> +void kvm_ioregionfd_init(struct kvm *kvm);
> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> +
> +#else
> +
> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> +static inline int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	return -ENOSYS;
> +}
> +#endif
> +
>   void kvm_arch_irq_routing_update(struct kvm *kvm);
>   
>   static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ca41220b40b8..81e775778c66 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>   
> +enum {
> +	kvm_ioregion_flag_nr_pio,
> +	kvm_ioregion_flag_nr_posted_writes,
> +	kvm_ioregion_flag_nr_max,
> +};
> +
> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> +#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
> +
> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
> +
> +struct kvm_ioregion {
> +	__u64 guest_paddr; /* guest physical address */
> +	__u64 memory_size; /* bytes */


Do we really need __u64 here?


> +	__u64 user_data;
> +	__s32 rfd;
> +	__s32 wfd;
> +	__u32 flags;
> +	__u8  pad[28];
> +};
> +
>   #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>   #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>   #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>   #define KVM_CAP_X86_USER_SPACE_MSR 188
>   #define KVM_CAP_X86_MSR_FILTER 189
>   #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> +#define KVM_CAP_IOREGIONFD 191
>   
>   #ifdef KVM_CAP_IRQ_ROUTING
>   
> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>   					struct kvm_userspace_memory_region)
>   #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>   #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct kvm_ioregion)
>   
>   /* enable ucontrol for s390 */
>   struct kvm_s390_ucas_mapping {
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 1c37ccd5d402..5e6620bbf000 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
>          bool
>          select EVENTFD
>   
> +config KVM_IOREGION
> +       bool
> +
>   config KVM_MMIO
>          bool
>   
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index c2323c27a28b..aadb73903f8b 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -27,6 +27,7 @@
>   #include <trace/events/kvm.h>
>   
>   #include <kvm/iodev.h>
> +#include "ioregion.h"
>   
>   #ifdef CONFIG_HAVE_KVM_IRQFD
>   
> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops ioeventfd_ops = {
>   	.destructor = ioeventfd_destructor,
>   };
>   
> +#ifdef CONFIG_KVM_IOREGION
> +/* assumes kvm->slots_lock held */
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> +			  u64 start, u64 size)
> +{
> +	struct _ioeventfd *_p;
> +
> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> +		if (_p->bus_idx == bus_idx &&
> +		    overlap(start, size, _p->addr,
> +			    !_p->length ? 8 : _p->length))
> +			return true;
> +
> +	return false;
> +}
> +#endif
> +
>   /* assumes kvm->slots_lock held */
>   static bool
>   ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>   		       _p->datamatch == p->datamatch))))
>   			return true;
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> +					  !p->length ? 8 : p->length))
> +			return true;
> +#endif
> +
>   	return false;
>   }
>   
> diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> new file mode 100644
> index 000000000000..73a621eebae3
> --- /dev/null
> +++ b/virt/kvm/eventfd.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __KVM_EVENTFD_H__
> +#define __KVM_EVENTFD_H__
> +
> +#ifdef CONFIG_KVM_IOREGION
> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
> +#else
> +static inline bool
> +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
> +{
> +	return false;
> +}
> +#endif
> +#endif
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> new file mode 100644
> index 000000000000..48ff92bca966
> --- /dev/null
> +++ b/virt/kvm/ioregion.c
> @@ -0,0 +1,232 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kvm_host.h>
> +#include <linux/fs.h>
> +#include <kvm/iodev.h>
> +#include "eventfd.h"
> +
> +void
> +kvm_ioregionfd_init(struct kvm *kvm)
> +{
> +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> +}
> +
> +struct ioregion {
> +	struct list_head     list;
> +	u64                  paddr;  /* guest physical address */
> +	u64                  size;   /* size in bytes */
> +	struct file         *rf;
> +	struct file         *wf;
> +	u64                  user_data; /* opaque token used by userspace */
> +	struct kvm_io_device dev;
> +	bool                 posted_writes;
> +};
> +
> +static inline struct ioregion *
> +to_ioregion(struct kvm_io_device *dev)
> +{
> +	return container_of(dev, struct ioregion, dev);
> +}
> +
> +/* assumes kvm->slots_lock held */
> +static void
> +ioregion_release(struct ioregion *p)
> +{
> +	fput(p->rf);
> +	fput(p->wf);
> +	list_del(&p->list);
> +	kfree(p);
> +}
> +
> +static int
> +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> +	      int len, void *val)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static int
> +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> +		int len, const void *val)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +/*
> + * This function is called as KVM is completely shutting down.  We do not
> + * need to worry about locking just nuke anything we have as quickly as possible
> + */
> +static void
> +ioregion_destructor(struct kvm_io_device *this)
> +{
> +	struct ioregion *p = to_ioregion(this);
> +
> +	ioregion_release(p);
> +}
> +
> +static const struct kvm_io_device_ops ioregion_ops = {
> +	.read       = ioregion_read,
> +	.write      = ioregion_write,
> +	.destructor = ioregion_destructor,
> +};
> +
> +static inline struct list_head *
> +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> +{
> +	return (bus_idx == KVM_MMIO_BUS) ?
> +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> +}
> +
> +/* check for not overlapping case and reverse */
> +inline bool
> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> +{
> +	u64 end1 = start1 + size1 - 1;
> +	u64 end2 = start2 + size2 - 1;
> +
> +	return !(end1 < start2 || start1 >= end2);
> +}
> +
> +/* assumes kvm->slots_lock held */
> +bool
> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> +		      u64 start, u64 size)
> +{
> +	struct ioregion *_p;
> +	struct list_head *ioregions;
> +
> +	ioregions = get_ioregion_list(kvm, bus_idx);
> +	list_for_each_entry(_p, ioregions, list)
> +		if (overlap(start, size, _p->paddr, _p->size))
> +			return true;
> +
> +	return false;
> +}
> +
> +/* assumes kvm->slots_lock held */
> +static bool
> +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum kvm_bus bus_idx)
> +{
> +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> +		return true;
> +
> +	return false;
> +}
> +
> +static enum kvm_bus
> +get_bus_from_flags(__u32 flags)
> +{
> +	if (flags & KVM_IOREGION_PIO)
> +		return KVM_PIO_BUS;
> +	return KVM_MMIO_BUS;
> +}
> +
> +int
> +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	struct ioregion *p;
> +	struct file *rfile, *wfile;
> +	enum kvm_bus bus_idx;
> +	int ret = 0;
> +
> +	if (!args->memory_size)
> +		return -EINVAL;
> +	if ((args->guest_paddr + args->memory_size - 1) < args->guest_paddr)
> +		return -EINVAL;
> +
> +	rfile = fget(args->rfd);
> +	if (!rfile)
> +		return -EBADF;


So the question still stands: if we want to use an ioregionfd as a doorbell, we 
don't need rfd in that case, do we?


> +	wfile = fget(args->wfd);
> +	if (!wfile) {
> +		fput(rfile);
> +		return -EBADF;
> +	}
> +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags & O_NONBLOCK)) {
> +		ret = -EINVAL;
> +		goto fail;
> +	}


I wonder how much value there is in adding a check like this here (assuming our 
code can gracefully deal with a blocking fd).


> +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> +	if (!p) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	INIT_LIST_HEAD(&p->list);
> +	p->paddr = args->guest_paddr;
> +	p->size = args->memory_size;
> +	p->user_data = args->user_data;
> +	p->rf = rfile;
> +	p->wf = wfile;
> +	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
> +	bus_idx = get_bus_from_flags(args->flags);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	if (ioregion_collision(kvm, p, bus_idx)) {
> +		ret = -EEXIST;
> +		goto unlock_fail;
> +	}
> +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
> +				      &p->dev);


I think we agreed on the previous version that we need to deal with the 
FAST_MMIO bus here?


> +	if (ret < 0)
> +		goto unlock_fail;
> +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return 0;
> +
> +unlock_fail:
> +	mutex_unlock(&kvm->slots_lock);
> +	kfree(p);
> +fail:
> +	fput(rfile);
> +	fput(wfile);
> +
> +	return ret;
> +}
> +
> +static int
> +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	struct ioregion         *p, *tmp;
> +	enum kvm_bus             bus_idx;
> +	int                      ret = -ENOENT;
> +	struct list_head        *ioregions;
> +
> +	if (args->rfd != -1 || args->wfd != -1)
> +		return -EINVAL;
> +
> +	bus_idx = get_bus_from_flags(args->flags);
> +	ioregions = get_ioregion_list(kvm, bus_idx);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> +		if (p->paddr == args->guest_paddr  &&
> +		    p->size == args->memory_size) {
> +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
> +			ioregion_release(p);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return ret;
> +}
> +
> +int
> +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> +{
> +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> +		return -EINVAL;
> +	if (args->rfd == -1 || args->wfd == -1)
> +		return kvm_rm_ioregion(kvm, args);
> +
> +	return kvm_set_ioregion(kvm, args);
> +}
> diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> new file mode 100644
> index 000000000000..23ffa812ec7a
> --- /dev/null
> +++ b/virt/kvm/ioregion.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __KVM_IOREGION_H__
> +#define __KVM_IOREGION_H__
> +
> +#ifdef CONFIG_KVM_IOREGION
> +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
> +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size);
> +#else
> +static inline bool
> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64 size)
> +{
> +	return false;
> +}
> +#endif
> +#endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2541a17ff1c4..88b92fc3da51 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
>   	mmgrab(current->mm);
>   	kvm->mm = current->mm;
>   	kvm_eventfd_init(kvm);
> +	kvm_ioregionfd_init(kvm);
>   	mutex_init(&kvm->lock);
>   	mutex_init(&kvm->irq_lock);
>   	mutex_init(&kvm->slots_lock);
> @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
>   		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
>   		break;
>   	}
> +	case KVM_SET_IOREGION: {
> +		struct kvm_ioregion data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&data, argp, sizeof(data)))
> +			goto out;
> +
> +		r = kvm_ioregionfd(kvm, &data);
> +		break;
> +	}
>   	case KVM_GET_DIRTY_LOG: {
>   		struct kvm_dirty_log log;
>   


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-05 18:39     ` Elena Afanasova
@ 2021-02-08 11:49       ` Cornelia Huck
  0 siblings, 0 replies; 28+ messages in thread
From: Cornelia Huck @ 2021-02-08 11:49 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: kvm, stefanha, jag.raman, elena.ufimtseva

On Fri, 05 Feb 2021 10:39:33 -0800
Elena Afanasova <eafanasova@gmail.com> wrote:

> On Thu, 2021-02-04 at 14:03 +0100, Cornelia Huck wrote:
> > On Fri, 29 Jan 2021 21:48:26 +0300
> > Elena Afanasova <eafanasova@gmail.com> wrote:

> > > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> > >  					struct
> > > kvm_userspace_memory_region)
> > >  #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> > >  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > > kvm_ioregion)  
> > 
> > This new ioctl needs some documentation under
> > Documentation/virt/kvm/api.rst. (That would also make review easier.)
> >   
> Agreed. The latest version of the ioregionfd API can be found in 
> https://marc.info/?l=kvm&m=160633710708172&w=2. There are still some
> open questions like write coalescing support.  So I think API may still
> be changed during code reviews.

Understood.

> 
> > >  
> > >  /* enable ucontrol for s390 */
> > >  struct kvm_s390_ucas_mapping {  
> > 
> > (...)
> >   
> > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > index c2323c27a28b..aadb73903f8b 100644
> > > --- a/virt/kvm/eventfd.c
> > > +++ b/virt/kvm/eventfd.c
> > > @@ -27,6 +27,7 @@
> > >  #include <trace/events/kvm.h>
> > >  
> > >  #include <kvm/iodev.h>
> > > +#include "ioregion.h"
> > >  
> > >  #ifdef CONFIG_HAVE_KVM_IRQFD
> > >  
> > > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > > ioeventfd_ops = {
> > >  	.destructor = ioeventfd_destructor,
> > >  };
> > >  
> > > +#ifdef CONFIG_KVM_IOREGION
> > > +/* assumes kvm->slots_lock held */
> > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > > +			  u64 start, u64 size)
> > > +{
> > > +	struct _ioeventfd *_p;
> > > +
> > > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > > +		if (_p->bus_idx == bus_idx &&
> > > +		    overlap(start, size, _p->addr,
> > > +			    !_p->length ? 8 : _p->length))  
> > 
> > Not a problem right now, as this is x86 only, but I'm not sure we can
> > define "overlap" in a meaningful way for every bus_idx. (For example,
> > the s390-only ccw notifications use addr to identify a device; as
> > long
> > as addr is unique, there will be no clash. I'm not sure yet if
> > ioregions are usable for ccw devices, and if yes, in which form, but
> > we
> > should probably keep it in mind.)
> >   
> Thank you for pointing it out. Yes, CCW bus seems to be a special case.

In any case, it needs some special care if we want to include it later,
maybe by introducing a bus-specific collision check. As long as we're
just dealing with pio/mmio, I think the function can stay this way.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
  2021-01-30 16:58   ` Stefan Hajnoczi
@ 2021-02-09  6:21   ` Jason Wang
  2021-02-09 14:49     ` Stefan Hajnoczi
  2021-02-10 19:06     ` Elena Afanasova
  2021-02-09  6:26   ` Jason Wang
  2 siblings, 2 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-09  6:21 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/29 上午2:32, Elena Afanasova wrote:
> The vCPU thread may receive a signal during ioregionfd communication,
> ioctl(KVM_RUN) needs to return to userspace and then ioctl(KVM_RUN)
> must resume ioregionfd.


It looks to me that the patch contains much more than just signal handling 
(e.g. the protocol). Please split it.


>
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
> Changes in v2:
>    - add support for x86 signal handling
>    - changes after code review
>
>   arch/x86/kvm/x86.c            | 196 +++++++++++++++++++++++++++++++---
>   include/linux/kvm_host.h      |  13 +++
>   include/uapi/linux/ioregion.h |  32 ++++++
>   virt/kvm/ioregion.c           | 177 +++++++++++++++++++++++++++++-
>   virt/kvm/kvm_main.c           |  16 ++-
>   5 files changed, 415 insertions(+), 19 deletions(-)
>   create mode 100644 include/uapi/linux/ioregion.h


I wonder whether it's better to split into two patches:

1) general signal support for KVM I/O device
2) the ioregionfd part


>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ddb28f5ca252..a04516b531da 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>   {
>   	int handled = 0;
>   	int n;
> +	int ret = 0;
> +	bool is_apic;
>   
>   	do {
>   		n = min(len, 8);
> -		if (!(lapic_in_kernel(vcpu) &&
> -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
> -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
> -			break;
> +		is_apic = lapic_in_kernel(vcpu) &&
> +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> +					      addr, n, v);


A better name is needed since "is_apic" only covers the first condition.


> +		if (!is_apic) {
> +			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
> +					       addr, n, v);
> +			if (ret)
> +				break;
> +		}
>   		handled += n;
>   		addr += n;
>   		len -= n;
>   		v += n;
>   	} while (len);
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;


My understanding is that we should check for ERESTARTSYS instead of EINTR; 
EINTR means the syscall can't be restarted.

E.g we had the following errno for sockets:

/* Alas, with timeout socket operations are not restartable.
  * Compare this to poll().
  */
static inline int sock_intr_errno(long timeo)
{
     return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
}

For the case of EINTR, do we need to fall back to the vCPU userspace process 
(QEMU)?

And we probably need a trace point here.


> +	}
> +#endif
> +
>   	return handled;
>   }
>   
> @@ -5819,14 +5833,20 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
>   {
>   	int handled = 0;
>   	int n;
> +	int ret = 0;
> +	bool is_apic;
>   
>   	do {
>   		n = min(len, 8);
> -		if (!(lapic_in_kernel(vcpu) &&
> -		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> -					 addr, n, v))
> -		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
> -			break;
> +		is_apic = lapic_in_kernel(vcpu) &&
> +			  !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> +					     addr, n, v);
> +		if (!is_apic) {
> +			ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS,
> +					      addr, n, v);
> +			if (ret)
> +				break;
> +		}
>   		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
>   		handled += n;
>   		addr += n;
> @@ -5834,6 +5854,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
>   		v += n;
>   	} while (len);
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;
> +	}
> +#endif
> +
>   	return handled;
>   }
>   
> @@ -6294,6 +6321,12 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
>   	vcpu->mmio_needed = 1;
>   	vcpu->mmio_cur_fragment = 0;
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (vcpu->ioregion_interrupted &&
> +	    vcpu->run->exit_reason == KVM_EXIT_INTR)
> +		return (vcpu->ioregion_ctx.in) ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
> +#endif
> +
>   	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
>   	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
>   	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> @@ -6411,16 +6444,23 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
>   
>   	for (i = 0; i < vcpu->arch.pio.count; i++) {
>   		if (vcpu->arch.pio.in)
> -			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
> +			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
> +					    vcpu->arch.pio.port,
>   					    vcpu->arch.pio.size, pd);
>   		else
>   			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
> -					     vcpu->arch.pio.port, vcpu->arch.pio.size,
> -					     pd);
> +					     vcpu->arch.pio.port,
> +					     vcpu->arch.pio.size, pd);
>   		if (r)
>   			break;
>   		pd += vcpu->arch.pio.size;
>   	}
> +#ifdef CONFIG_KVM_IOREGION
> +	if (vcpu->ioregion_interrupted && r == -EINTR) {
> +		vcpu->ioregion_ctx.pio = i;
> +	}
> +#endif
> +
>   	return r;
>   }
>   
> @@ -6428,16 +6468,27 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
>   			       unsigned short port, void *val,
>   			       unsigned int count, bool in)
>   {
> +	int ret = 0;
> +
>   	vcpu->arch.pio.port = port;
>   	vcpu->arch.pio.in = in;
>   	vcpu->arch.pio.count  = count;
>   	vcpu->arch.pio.size = size;
>   
> -	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
> +	ret = kernel_pio(vcpu, vcpu->arch.pio_data);
> +	if (!ret) {


Unnecessary changes.


>   		vcpu->arch.pio.count = 0;
>   		return 1;
>   	}
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;
> +		return 0;
> +	}
> +#endif
> +
>   	vcpu->run->exit_reason = KVM_EXIT_IO;
>   	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
>   	vcpu->run->io.size = size;
> @@ -7141,6 +7192,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
>   
>   static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
>   static int complete_emulated_pio(struct kvm_vcpu *vcpu);
> +#ifdef CONFIG_KVM_IOREGION
> +static int complete_ioregion_io(struct kvm_vcpu *vcpu);
> +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu);
> +#endif
>   
>   static void kvm_smm_changed(struct kvm_vcpu *vcpu)
>   {
> @@ -7405,6 +7460,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>   		r = 1;
>   		if (inject_emulated_exception(vcpu))
>   			return r;
> +#ifdef CONFIG_KVM_IOREGION
> +	} else if (vcpu->ioregion_interrupted &&
> +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> +		if (vcpu->ioregion_ctx.in)
> +			writeback = false;
> +		vcpu->arch.complete_userspace_io = complete_ioregion_io;
> +		r = 0;
> +#endif
>   	} else if (vcpu->arch.pio.count) {
>   		if (!vcpu->arch.pio.in) {
>   			/* FIXME: return into emulator if single-stepping.  */
> @@ -7501,6 +7564,11 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
>   		vcpu->arch.complete_userspace_io =
>   			complete_fast_pio_out_port_0x7e;
>   		kvm_skip_emulated_instruction(vcpu);
> +#ifdef CONFIG_KVM_IOREGION
> +	} else if (vcpu->ioregion_interrupted &&
> +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> +		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
> +#endif
>   	} else {
>   		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
>   		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
> @@ -7548,6 +7616,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
>   		return ret;
>   	}
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (vcpu->ioregion_interrupted &&
> +	    vcpu->run->exit_reason == KVM_EXIT_INTR) {
> +		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
> +		return 0;
> +	}
> +#endif
>   	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
>   	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
>   
> @@ -9204,6 +9279,101 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>   
> +#ifdef CONFIG_KVM_IOREGION
> +static void complete_ioregion_access(struct kvm_vcpu *vcpu, gpa_t addr,
> +				     int len, void *val)
> +{
> +	if (vcpu->ioregion_ctx.in)
> +		vcpu->ioregion_ctx.dev->ops->read(vcpu, vcpu->ioregion_ctx.dev,
> +						  addr, len, val);
> +	else
> +		vcpu->ioregion_ctx.dev->ops->write(vcpu, vcpu->ioregion_ctx.dev,
> +						   addr, len, val);


Two dumb questions:

1) So if the write is interrupted by the signal, we may perform the write twice or 
more. Can this satisfy the semantics of all types of registers? E.g. 
hardware that counts the number of times a specific register is written, etc.
2) If the answer is yes, can we simply rewind RIP to re-emulate the 
instruction?


> +}
> +
> +static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_mmio_fragment *frag;
> +	int idx, ret, i, n;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +	for (i = vcpu->mmio_cur_fragment; i < vcpu->mmio_nr_fragments; i++) {
> +		frag = &vcpu->mmio_fragments[i];
> +		do {
> +			n = min(8u, frag->len);
> +			complete_ioregion_access(vcpu, frag->gpa, n, frag->data);
> +			frag->len -= n;
> +			frag->data += n;
> +			frag->gpa += n;
> +		} while (frag->len);
> +		vcpu->mmio_cur_fragment++;
> +	}
> +
> +	vcpu->mmio_needed = 0;
> +	if (!vcpu->ioregion_ctx.in) {
> +		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +		return 1;
> +	}
> +
> +	vcpu->mmio_read_completed = 1;
> +	ret = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +	return ret;
> +}
> +
> +static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
> +{
> +	int i, idx, r = 1;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +	for (i = vcpu->ioregion_ctx.pio; i < vcpu->arch.pio.count; i++) {
> +		complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> +					 vcpu->ioregion_ctx.len,
> +					 vcpu->ioregion_ctx.val);
> +		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
> +	}
> +
> +	if (vcpu->ioregion_ctx.in)
> +		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +	vcpu->arch.pio.count = 0;
> +
> +	return r;
> +}
> +
> +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
> +{
> +	int idx;
> +	u64 val;
> +
> +	BUG_ON(!vcpu->ioregion_interrupted);
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> +				 vcpu->ioregion_ctx.len,
> +				 vcpu->ioregion_ctx.val);
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (vcpu->ioregion_ctx.in) {
> +		memcpy(&val, vcpu->ioregion_ctx.val, vcpu->ioregion_ctx.len);
> +		kvm_rax_write(vcpu, val);
> +	}
> +	vcpu->arch.pio.count = 0;
> +
> +	return kvm_skip_emulated_instruction(vcpu);
> +}
> +
> +static int complete_ioregion_io(struct kvm_vcpu *vcpu)
> +{
> +	BUG_ON(!vcpu->ioregion_interrupted);
> +
> +	if (vcpu->mmio_needed)
> +		return complete_ioregion_mmio(vcpu);
> +	if (vcpu->arch.pio.count)
> +		return complete_ioregion_pio(vcpu);
> +}
> +#endif
> +
>   static void kvm_save_current_fpu(struct fpu *fpu)
>   {
>   	/*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7cd667dddba9..5cfdecfca6db 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -318,6 +318,19 @@ struct kvm_vcpu {
>   #endif
>   	bool preempted;
>   	bool ready;
> +#ifdef CONFIG_KVM_IOREGION
> +	bool ioregion_interrupted;
> +	struct {
> +		struct kvm_io_device *dev;
> +		int pio;
> +		void *val;
> +		u8 state;


Let's document the state machine here.


> +		u64 addr;
> +		int len;
> +		u64 data;
> +		bool in;
> +	} ioregion_ctx;
> +#endif
>   	struct kvm_vcpu_arch arch;
>   };
>   
> diff --git a/include/uapi/linux/ioregion.h b/include/uapi/linux/ioregion.h
> new file mode 100644
> index 000000000000..7898c01f84a1
> --- /dev/null
> +++ b/include/uapi/linux/ioregion.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_IOREGION_H
> +#define _UAPI_LINUX_IOREGION_H
> +
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};
> +
> +struct ioregionfd_resp {
> +	__u64 data;
> +	__u8 pad[24];
> +};
> +
> +#define IOREGIONFD_CMD_READ    0
> +#define IOREGIONFD_CMD_WRITE   1
> +
> +#define IOREGIONFD_SIZE_8BIT   0
> +#define IOREGIONFD_SIZE_16BIT  1
> +#define IOREGIONFD_SIZE_32BIT  2
> +#define IOREGIONFD_SIZE_64BIT  3
> +
> +#define IOREGIONFD_SIZE_OFFSET 4
> +#define IOREGIONFD_RESP_OFFSET 6
> +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)


Instead of using macros, why not define these fields explicitly in struct 
ioregionfd_cmd rather than packing them into info?


> +
> +#endif
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index 48ff92bca966..da38124e1418 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -3,6 +3,7 @@
>   #include <linux/fs.h>
>   #include <kvm/iodev.h>
>   #include "eventfd.h"
> +#include <uapi/linux/ioregion.h>
>   
>   void
>   kvm_ioregionfd_init(struct kvm *kvm)
> @@ -38,18 +39,190 @@ ioregion_release(struct ioregion *p)
>   	kfree(p);
>   }
>   
> +static bool
> +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, int resp,
> +	 u64 user_data, const void *val)
> +{
> +	u64 size = 0;
> +
> +	switch (len) {
> +	case 1:
> +		size = IOREGIONFD_SIZE_8BIT;
> +		break;
> +	case 2:
> +		size = IOREGIONFD_SIZE_16BIT;
> +		break;
> +	case 4:
> +		size = IOREGIONFD_SIZE_32BIT;
> +		break;
> +	case 8:
> +		size = IOREGIONFD_SIZE_64BIT;
> +		break;
> +	default:
> +		return false;
> +	}
> +
> +	if (val)
> +		memcpy(&cmd->data, val, len);
> +	cmd->user_data = user_data;
> +	cmd->offset = offset;
> +	cmd->info |= opt;
> +	cmd->info |= IOREGIONFD_SIZE(size);
> +	cmd->info |= IOREGIONFD_RESP(resp);
> +
> +	return true;
> +}
> +
> +enum {
> +	SEND_CMD,
> +	GET_REPLY,
> +	COMPLETE
> +};
> +
> +static void
> +ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> +		  bool in, gpa_t addr, int len, u64 data, u8 state, void *val)
> +{
> +	vcpu->ioregion_interrupted = true;
> +
> +	vcpu->ioregion_ctx.dev = this;
> +	vcpu->ioregion_ctx.val = val;
> +	vcpu->ioregion_ctx.state = state;
> +	vcpu->ioregion_ctx.addr = addr;
> +	vcpu->ioregion_ctx.len = len;
> +	vcpu->ioregion_ctx.data = data;
> +	vcpu->ioregion_ctx.in = in;
> +}
> +
>   static int
>   ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
>   	      int len, void *val)
>   {
> -	return -EOPNOTSUPP;
> +	struct ioregion *p = to_ioregion(this);
> +	union {
> +		struct ioregionfd_cmd cmd;
> +		struct ioregionfd_resp resp;
> +	} buf;
> +	int ret = 0;
> +	int state = 0;


Let's use SEND_CMD otherwise it would be hard for the reviewers...


> +
> +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> +		return -EINVAL;
> +
> +	if (unlikely(vcpu->ioregion_interrupted)) {
> +		vcpu->ioregion_interrupted = false;
> +
> +		switch (vcpu->ioregion_ctx.state) {
> +		case SEND_CMD:
> +			goto send_cmd;
> +		case GET_REPLY:
> +			goto get_repl;
> +		case COMPLETE:


I fail to understand under what condition we can reach here.


> +			memcpy(val, &vcpu->ioregion_ctx.data, len);
> +			return 0;
> +		}
> +	}
> +
> +send_cmd:
> +	memset(&buf, 0, sizeof(buf));
> +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
> +		      1, p->user_data, NULL))
> +		return -EOPNOTSUPP;
> +
> +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> +	state = (ret == sizeof(buf.cmd));
> +	if (signal_pending(current)) {
> +		ioregion_save_ctx(vcpu, this, 1, addr, len, 0, state, val);
> +		return -EINTR;
> +	}
> +	if (ret != sizeof(buf.cmd)) {
> +		ret = (ret < 0) ? ret : -EIO;
> +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +	}
> +
> +get_repl:
> +	memset(&buf, 0, sizeof(buf));
> +	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> +	state += (ret == sizeof(buf.resp));


Let's use enum instead of doing tricks like this.

Thanks


> +	if (signal_pending(current)) {
> +		ioregion_save_ctx(vcpu, this, 1, addr, len, buf.resp.data, state, val);
> +		return -EINTR;
> +	}
> +	if (ret != sizeof(buf.resp)) {
> +		ret = (ret < 0) ? ret : -EIO;
> +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +	}
> +
> +	memcpy(val, &buf.resp.data, len);
> +
> +	return 0;
>   }
>   
>   static int
>   ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
>   		int len, const void *val)
>   {
> -	return -EOPNOTSUPP;
> +	struct ioregion *p = to_ioregion(this);
> +	union {
> +		struct ioregionfd_cmd cmd;
> +		struct ioregionfd_resp resp;
> +	} buf;
> +	int ret = 0;
> +	int state = 0;
> +
> +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> +		return -EINVAL;
> +
> +	if (unlikely(vcpu->ioregion_interrupted)) {
> +		vcpu->ioregion_interrupted = false;
> +
> +		switch (vcpu->ioregion_ctx.state) {
> +		case SEND_CMD:
> +			goto send_cmd;
> +		case GET_REPLY:
> +			if (!p->posted_writes)
> +				goto get_repl;
> +			fallthrough;
> +		case COMPLETE:
> +			return 0;
> +		}
> +	}
> +
> +send_cmd:
> +	memset(&buf, 0, sizeof(buf));
> +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
> +		      p->posted_writes ? 0 : 1, p->user_data, val))
> +		return -EOPNOTSUPP;
> +
> +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> +	state = (ret == sizeof(buf.cmd));
> +	if (signal_pending(current)) {
> +		ioregion_save_ctx(vcpu, this, 0, addr, len,
> +				  0, state, (void *)val);
> +		return -EINTR;
> +	}
> +	if (ret != sizeof(buf.cmd)) {
> +		ret = (ret < 0) ? ret : -EIO;
> +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +	}
> +
> +get_repl:
> +	if (!p->posted_writes) {
> +		memset(&buf, 0, sizeof(buf));
> +		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> +		state += (ret == sizeof(buf.resp));
> +		if (signal_pending(current)) {
> +			ioregion_save_ctx(vcpu, this, 0, addr, len,
> +					  0, state, (void *)val);
> +			return -EINTR;
> +		}
> +		if (ret != sizeof(buf.resp)) {
> +			ret = (ret < 0) ? ret : -EIO;
> +			return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +		}
> +	}
> +
> +	return 0;
>   }
>   
>   /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 88b92fc3da51..df387857f51f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   			      struct kvm_io_range *range, const void *val)
>   {
>   	int idx;
> +	int ret = 0;
>   
>   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
>   	if (idx < 0)
> @@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   
>   	while (idx < bus->dev_count &&
>   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> -		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> -					range->len, val))
> +		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> +					 range->len, val);
> +		if (!ret)
>   			return idx;
> +		if (ret < 0 && ret != -EOPNOTSUPP)
> +			return ret;
>   		idx++;
>   	}
>   
> @@ -4264,6 +4268,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   			     struct kvm_io_range *range, void *val)
>   {
>   	int idx;
> +	int ret = 0;
>   
>   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
>   	if (idx < 0)
> @@ -4271,9 +4276,12 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   
>   	while (idx < bus->dev_count &&
>   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> -		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
> -				       range->len, val))
> +		ret = kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
> +					range->len, val);
> +		if (!ret)
>   			return idx;
> +		if (ret < 0 && ret != -EOPNOTSUPP)
> +			return ret;
>   		idx++;
>   	}
>   


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
  2021-01-30 16:58   ` Stefan Hajnoczi
  2021-02-09  6:21   ` Jason Wang
@ 2021-02-09  6:26   ` Jason Wang
  2 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-09  6:26 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/1/29 上午2:32, Elena Afanasova wrote:
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};


So I still don't understand how the kernel and userspace are 
synchronized when the fd is being used simultaneously by multiple 
devices/regions.

It might be helpful to document the protocol in api.rst.

Thanks


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-02-09  6:21   ` Jason Wang
@ 2021-02-09 14:49     ` Stefan Hajnoczi
  2021-02-10 19:06     ` Elena Afanasova
  1 sibling, 0 replies; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-02-09 14:49 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 4421 bytes --]

On Tue, Feb 09, 2021 at 02:21:22PM +0800, Jason Wang wrote:
> On 2021/1/29 上午2:32, Elena Afanasova wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index ddb28f5ca252..a04516b531da 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
> >   {
> >   	int handled = 0;
> >   	int n;
> > +	int ret = 0;
> > +	bool is_apic;
> >   	do {
> >   		n = min(len, 8);
> > -		if (!(lapic_in_kernel(vcpu) &&
> > -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
> > -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
> > -			break;
> > +		is_apic = lapic_in_kernel(vcpu) &&
> > +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> > +					      addr, n, v);
> 
> 
> A better name is needed since "is_apic" only covers the first condition.

The kvm_iodevice_write() call is specific to vcpu->arch.apic->dev (the
in-kernel APIC device). It's not a generic kvm_io_bus_write() call, so
"is_apic" seems correct to me.

> > @@ -6428,16 +6468,27 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
> >   			       unsigned short port, void *val,
> >   			       unsigned int count, bool in)
> >   {
> > +	int ret = 0;
> > +
> >   	vcpu->arch.pio.port = port;
> >   	vcpu->arch.pio.in = in;
> >   	vcpu->arch.pio.count  = count;
> >   	vcpu->arch.pio.size = size;
> > -	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
> > +	ret = kernel_pio(vcpu, vcpu->arch.pio_data);
> > +	if (!ret) {
> 
> 
> Unnecessary changes.
[...]
> >   		vcpu->arch.pio.count = 0;
> >   		return 1;
> >   	}
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +		return 0;
> > +	}
> > +#endif

ret is used here. The change above looks necessary to me.

> > diff --git a/include/uapi/linux/ioregion.h b/include/uapi/linux/ioregion.h
> > new file mode 100644
> > index 000000000000..7898c01f84a1
> > --- /dev/null
> > +++ b/include/uapi/linux/ioregion.h
> > @@ -0,0 +1,32 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_IOREGION_H
> > +#define _UAPI_LINUX_IOREGION_H
> > +
> > +/* Wire protocol */
> > +struct ioregionfd_cmd {
> > +	__u32 info;
> > +	__u32 padding;
> > +	__u64 user_data;
> > +	__u64 offset;
> > +	__u64 data;
> > +};
> > +
> > +struct ioregionfd_resp {
> > +	__u64 data;
> > +	__u8 pad[24];
> > +};
> > +
> > +#define IOREGIONFD_CMD_READ    0
> > +#define IOREGIONFD_CMD_WRITE   1
> > +
> > +#define IOREGIONFD_SIZE_8BIT   0
> > +#define IOREGIONFD_SIZE_16BIT  1
> > +#define IOREGIONFD_SIZE_32BIT  2
> > +#define IOREGIONFD_SIZE_64BIT  3
> > +
> > +#define IOREGIONFD_SIZE_OFFSET 4
> > +#define IOREGIONFD_RESP_OFFSET 6
> > +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> > +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)
> 
> 
> Instead of using macros, why not explicitly define them in struct
> ioregionfd_cmd instead of using info?

Good idea, these macros are a little confusing. They produce the info
field value but when reading the code quickly one might think they parse
it instead.

I would go all the way and use a union type:

  struct ioregionfd_cmd {
      __u8 cmd;
      union {
          /* IOREGIONFD_CMD_READ */
          struct {
              __u8 size_exponent : 4;
	      __u8 padding[6];
	      __u64 user_data;
	      __u64 offset;
	  } read;

          /* IOREGIONFD_CMD_WRITE */
          struct {
              __u8 size_exponent : 4;
	      __u8 padding[6];
	      __u64 user_data;
	      __u64 offset;
	      __u64 data;
	  } write;

	  __u8 padding[31];
      };
  };

That way we're not restricted to putting data into a single set of
struct fields for all commands. New commands can easily be added with
totally different fields.

(I didn't check whether the syntax above results in the desired memory
layout, but you can check with pahole or similar tools.)

(Also, I checked that C bit-fields can be used in Linux uapi headers.
Although their representation is implementation-defined according to the
C standard there is a well-defined representation in the System V ABI
that gcc, clang, and other compilers follow on Linux.)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-08  6:21   ` Jason Wang
@ 2021-02-09 14:59     ` Stefan Hajnoczi
  2021-02-18  6:17       ` Jason Wang
  2021-02-10 19:31     ` Elena Afanasova
  1 sibling, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-02-09 14:59 UTC (permalink / raw)
  To: Jason Wang; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 1430 bytes --]

On Mon, Feb 08, 2021 at 02:21:35PM +0800, Jason Wang wrote:
> On 2021/1/30 上午2:48, Elena Afanasova wrote:
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index ca41220b40b8..81e775778c66 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> >   	__u8  pad[36];
> >   };
> > +enum {
> > +	kvm_ioregion_flag_nr_pio,
> > +	kvm_ioregion_flag_nr_posted_writes,
> > +	kvm_ioregion_flag_nr_max,
> > +};
> > +
> > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > +#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
> > +
> > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
> > +
> > +struct kvm_ioregion {
> > +	__u64 guest_paddr; /* guest physical address */
> > +	__u64 memory_size; /* bytes */
> 
> 
> Do we really need __u64 here?

I think 64-bit PCI BARs can be >4 GB. There is plenty of space in this
struct to support a 64-bit field.

That said, userspace could also add more ioregions if it needs to cover
more than 4 GB. That would slow down ioregion lookups though since the
in-kernel data structure would become larger.

Making it 64-bit seems more future-proof and cleaner than having to work
around the limitation using multiple ioregions. Did you have a
particular reason in mind why this field should not be 64 bits?

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
  2021-02-09  6:21   ` Jason Wang
  2021-02-09 14:49     ` Stefan Hajnoczi
@ 2021-02-10 19:06     ` Elena Afanasova
  1 sibling, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-02-10 19:06 UTC (permalink / raw)
  To: Jason Wang, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva

On Tue, 2021-02-09 at 14:21 +0800, Jason Wang wrote:
> On 2021/1/29 上午2:32, Elena Afanasova wrote:
> > The vCPU thread may receive a signal during ioregionfd
> > communication,
> > ioctl(KVM_RUN) needs to return to userspace and then ioctl(KVM_RUN)
> > must resume ioregionfd.
> 
> It looks to me the patch contains much more than just signal
> handling 
> (e.g the protocol). Please split.
> 
Ok

> 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> > Changes in v2:
> >    - add support for x86 signal handling
> >    - changes after code review
> > 
> >   arch/x86/kvm/x86.c            | 196
> > +++++++++++++++++++++++++++++++---
> >   include/linux/kvm_host.h      |  13 +++
> >   include/uapi/linux/ioregion.h |  32 ++++++
> >   virt/kvm/ioregion.c           | 177
> > +++++++++++++++++++++++++++++-
> >   virt/kvm/kvm_main.c           |  16 ++-
> >   5 files changed, 415 insertions(+), 19 deletions(-)
> >   create mode 100644 include/uapi/linux/ioregion.h
> 
> I wonder whether it's better to split into two patches:
> 
> 1) general signal support for KVM I/O device
> 2) the ioregionfd part
> 
> 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index ddb28f5ca252..a04516b531da 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >   {
> >   	int handled = 0;
> >   	int n;
> > +	int ret = 0;
> > +	bool is_apic;
> >   
> >   	do {
> >   		n = min(len, 8);
> > -		if (!(lapic_in_kernel(vcpu) &&
> > -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> > addr, n, v))
> > -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n,
> > v))
> > -			break;
> > +		is_apic = lapic_in_kernel(vcpu) &&
> > +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic-
> > >dev,
> > +					      addr, n, v);
> 
> A better name is needed since "is_apic" only covers the first
> condition.
> 
> 
> > +		if (!is_apic) {
> > +			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
> > +					       addr, n, v);
> > +			if (ret)
> > +				break;
> > +		}
> >   		handled += n;
> >   		addr += n;
> >   		len -= n;
> >   		v += n;
> >   	} while (len);
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> 
> My understanding is that, we should check ERESTARTSYS instead of
> EINTR. 
> EINTR means the syscall can't be restarted.
> 
I think the case when ioregionfd communication is interrupted can be
seen as interrupted ioctl(KVM_RUN).

> E.g we had the following errno for sockets:
> 
> /* Alas, with timeout socket operations are not restartable.
>   * Compare this to poll().
>   */
> static inline int sock_intr_errno(long timeo)
> {
>      return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
> }
> 
> For the case of EINTR, do we need a fallback to vcpu userspace
> process 
> (Qemu)?
> 
Yes, ioctl(KVM_RUN) needs to return to userspace.

> And we probably need a trace point here.
> 
> 
> > +	}
> > +#endif
> > +
> >   	return handled;
> >   }
> >   
> > @@ -5819,14 +5833,20 @@ static int vcpu_mmio_read(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len, void *v)
> >   {
> >   	int handled = 0;
> >   	int n;
> > +	int ret = 0;
> > +	bool is_apic;
> >   
> >   	do {
> >   		n = min(len, 8);
> > -		if (!(lapic_in_kernel(vcpu) &&
> > -		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> > -					 addr, n, v))
> > -		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
> > -			break;
> > +		is_apic = lapic_in_kernel(vcpu) &&
> > +			  !kvm_iodevice_read(vcpu, &vcpu->arch.apic-
> > >dev,
> > +					     addr, n, v);
> > +		if (!is_apic) {
> > +			ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS,
> > +					      addr, n, v);
> > +			if (ret)
> > +				break;
> > +		}
> >   		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
> >   		handled += n;
> >   		addr += n;
> > @@ -5834,6 +5854,13 @@ static int vcpu_mmio_read(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len, void *v)
> >   		v += n;
> >   	} while (len);
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +	}
> > +#endif
> > +
> >   	return handled;
> >   }
> >   
> > @@ -6294,6 +6321,12 @@ static int emulator_read_write(struct
> > x86_emulate_ctxt *ctxt,
> >   	vcpu->mmio_needed = 1;
> >   	vcpu->mmio_cur_fragment = 0;
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (vcpu->ioregion_interrupted &&
> > +	    vcpu->run->exit_reason == KVM_EXIT_INTR)
> > +		return (vcpu->ioregion_ctx.in) ? X86EMUL_IO_NEEDED :
> > X86EMUL_CONTINUE;
> > +#endif
> > +
> >   	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
> >   	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
> >   	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > @@ -6411,16 +6444,23 @@ static int kernel_pio(struct kvm_vcpu
> > *vcpu, void *pd)
> >   
> >   	for (i = 0; i < vcpu->arch.pio.count; i++) {
> >   		if (vcpu->arch.pio.in)
> > -			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu-
> > >arch.pio.port,
> > +			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
> > +					    vcpu->arch.pio.port,
> >   					    vcpu->arch.pio.size, pd);
> >   		else
> >   			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
> > -					     vcpu->arch.pio.port, vcpu-
> > >arch.pio.size,
> > -					     pd);
> > +					     vcpu->arch.pio.port,
> > +					     vcpu->arch.pio.size, pd);
> >   		if (r)
> >   			break;
> >   		pd += vcpu->arch.pio.size;
> >   	}
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (vcpu->ioregion_interrupted && r == -EINTR) {
> > +		vcpu->ioregion_ctx.pio = i;
> > +	}
> > +#endif
> > +
> >   	return r;
> >   }
> >   
> > @@ -6428,16 +6468,27 @@ static int emulator_pio_in_out(struct
> > kvm_vcpu *vcpu, int size,
> >   			       unsigned short port, void *val,
> >   			       unsigned int count, bool in)
> >   {
> > +	int ret = 0;
> > +
> >   	vcpu->arch.pio.port = port;
> >   	vcpu->arch.pio.in = in;
> >   	vcpu->arch.pio.count  = count;
> >   	vcpu->arch.pio.size = size;
> >   
> > -	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
> > +	ret = kernel_pio(vcpu, vcpu->arch.pio_data);
> > +	if (!ret) {
> 
> Unnecessary changes.
> 
> 
> >   		vcpu->arch.pio.count = 0;
> >   		return 1;
> >   	}
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +		return 0;
> > +	}
> > +#endif
> > +
> >   	vcpu->run->exit_reason = KVM_EXIT_IO;
> >   	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN :
> > KVM_EXIT_IO_OUT;
> >   	vcpu->run->io.size = size;
> > @@ -7141,6 +7192,10 @@ static bool retry_instruction(struct
> > x86_emulate_ctxt *ctxt,
> >   
> >   static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
> >   static int complete_emulated_pio(struct kvm_vcpu *vcpu);
> > +#ifdef CONFIG_KVM_IOREGION
> > +static int complete_ioregion_io(struct kvm_vcpu *vcpu);
> > +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu);
> > +#endif
> >   
> >   static void kvm_smm_changed(struct kvm_vcpu *vcpu)
> >   {
> > @@ -7405,6 +7460,14 @@ int x86_emulate_instruction(struct kvm_vcpu
> > *vcpu, gpa_t cr2_or_gpa,
> >   		r = 1;
> >   		if (inject_emulated_exception(vcpu))
> >   			return r;
> > +#ifdef CONFIG_KVM_IOREGION
> > +	} else if (vcpu->ioregion_interrupted &&
> > +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> > +		if (vcpu->ioregion_ctx.in)
> > +			writeback = false;
> > +		vcpu->arch.complete_userspace_io =
> > complete_ioregion_io;
> > +		r = 0;
> > +#endif
> >   	} else if (vcpu->arch.pio.count) {
> >   		if (!vcpu->arch.pio.in) {
> >   			/* FIXME: return into emulator if single-
> > stepping.  */
> > @@ -7501,6 +7564,11 @@ static int kvm_fast_pio_out(struct kvm_vcpu
> > *vcpu, int size,
> >   		vcpu->arch.complete_userspace_io =
> >   			complete_fast_pio_out_port_0x7e;
> >   		kvm_skip_emulated_instruction(vcpu);
> > +#ifdef CONFIG_KVM_IOREGION
> > +	} else if (vcpu->ioregion_interrupted &&
> > +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> > +		vcpu->arch.complete_userspace_io =
> > complete_ioregion_fast_pio;
> > +#endif
> >   	} else {
> >   		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
> >   		vcpu->arch.complete_userspace_io =
> > complete_fast_pio_out;
> > @@ -7548,6 +7616,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu
> > *vcpu, int size,
> >   		return ret;
> >   	}
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (vcpu->ioregion_interrupted &&
> > +	    vcpu->run->exit_reason == KVM_EXIT_INTR) {
> > +		vcpu->arch.complete_userspace_io =
> > complete_ioregion_fast_pio;
> > +		return 0;
> > +	}
> > +#endif
> >   	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
> >   	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
> >   
> > @@ -9204,6 +9279,101 @@ static int complete_emulated_mmio(struct
> > kvm_vcpu *vcpu)
> >   	return 0;
> >   }
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +static void complete_ioregion_access(struct kvm_vcpu *vcpu, gpa_t
> > addr,
> > +				     int len, void *val)
> > +{
> > +	if (vcpu->ioregion_ctx.in)
> > +		vcpu->ioregion_ctx.dev->ops->read(vcpu, vcpu-
> > >ioregion_ctx.dev,
> > +						  addr, len, val);
> > +	else
> > +		vcpu->ioregion_ctx.dev->ops->write(vcpu, vcpu-
> > >ioregion_ctx.dev,
> > +						   addr, len, val);
> 
> Two dumb questions:
> 
> 1) So if the write is interrupted by the signal, we may do twice or
> more 
> write. Can this satisfies the semantics of all type of registers? 

I think it's necessary to call kvm_io_bus_{read, write}() here. If
there is no in-kernel device or ioregion gets deleted KVM needs to
return to userspace with KVM_EXIT_MMIO/KVM_EXIT_IO.

> E.g 
> for the hardware that counts the time of write to a specific register
> etc.
> 2) If the answer is yes, can we simply rewind RIP to re-emulate the 
> instruction?
> 
> 
> > +}
> > +
> > +static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_mmio_fragment *frag;
> > +	int idx, ret, i, n;
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +	for (i = vcpu->mmio_cur_fragment; i < vcpu->mmio_nr_fragments;
> > i++) {
> > +		frag = &vcpu->mmio_fragments[i];
> > +		do {
> > +			n = min(8u, frag->len);
> > +			complete_ioregion_access(vcpu, frag->gpa, n,
> > frag->data);
> > +			frag->len -= n;
> > +			frag->data += n;
> > +			frag->gpa += n;
> > +		} while (frag->len);
> > +		vcpu->mmio_cur_fragment++;
> > +	}
> > +
> > +	vcpu->mmio_needed = 0;
> > +	if (!vcpu->ioregion_ctx.in) {
> > +		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +		return 1;
> > +	}
> > +
> > +	vcpu->mmio_read_completed = 1;
> > +	ret = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +	return ret;
> > +}
> > +
> > +static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
> > +{
> > +	int i, idx, r = 1;
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +	for (i = vcpu->ioregion_ctx.pio; i < vcpu->arch.pio.count; i++)
> > {
> > +		complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> > +					 vcpu->ioregion_ctx.len,
> > +					 vcpu->ioregion_ctx.val);
> > +		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
> > +	}
> > +
> > +	if (vcpu->ioregion_ctx.in)
> > +		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +	vcpu->arch.pio.count = 0;
> > +
> > +	return r;
> > +}
> > +
> > +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
> > +{
> > +	int idx;
> > +	u64 val;
> > +
> > +	BUG_ON(!vcpu->ioregion_interrupted);
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> > +				 vcpu->ioregion_ctx.len,
> > +				 vcpu->ioregion_ctx.val);
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +
> > +	if (vcpu->ioregion_ctx.in) {
> > +		memcpy(&val, vcpu->ioregion_ctx.val, vcpu-
> > >ioregion_ctx.len);
> > +		kvm_rax_write(vcpu, val);
> > +	}
> > +	vcpu->arch.pio.count = 0;
> > +
> > +	return kvm_skip_emulated_instruction(vcpu);
> > +}
> > +
> > +static int complete_ioregion_io(struct kvm_vcpu *vcpu)
> > +{
> > +	BUG_ON(!vcpu->ioregion_interrupted);
> > +
> > +	if (vcpu->mmio_needed)
> > +		return complete_ioregion_mmio(vcpu);
> > +	if (vcpu->arch.pio.count)
> > +		return complete_ioregion_pio(vcpu);
> > +}
> > +#endif
> > +
> >   static void kvm_save_current_fpu(struct fpu *fpu)
> >   {
> >   	/*
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7cd667dddba9..5cfdecfca6db 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -318,6 +318,19 @@ struct kvm_vcpu {
> >   #endif
> >   	bool preempted;
> >   	bool ready;
> > +#ifdef CONFIG_KVM_IOREGION
> > +	bool ioregion_interrupted;
> > +	struct {
> > +		struct kvm_io_device *dev;
> > +		int pio;
> > +		void *val;
> > +		u8 state;
> 
> Let's document the state machine here.
> 
> 
> > +		u64 addr;
> > +		int len;
> > +		u64 data;
> > +		bool in;
> > +	} ioregion_ctx;
> > +#endif
> >   	struct kvm_vcpu_arch arch;
> >   };
> >   
> > diff --git a/include/uapi/linux/ioregion.h
> > b/include/uapi/linux/ioregion.h
> > new file mode 100644
> > index 000000000000..7898c01f84a1
> > --- /dev/null
> > +++ b/include/uapi/linux/ioregion.h
> > @@ -0,0 +1,32 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_IOREGION_H
> > +#define _UAPI_LINUX_IOREGION_H
> > +
> > +/* Wire protocol */
> > +struct ioregionfd_cmd {
> > +	__u32 info;
> > +	__u32 padding;
> > +	__u64 user_data;
> > +	__u64 offset;
> > +	__u64 data;
> > +};
> > +
> > +struct ioregionfd_resp {
> > +	__u64 data;
> > +	__u8 pad[24];
> > +};
> > +
> > +#define IOREGIONFD_CMD_READ    0
> > +#define IOREGIONFD_CMD_WRITE   1
> > +
> > +#define IOREGIONFD_SIZE_8BIT   0
> > +#define IOREGIONFD_SIZE_16BIT  1
> > +#define IOREGIONFD_SIZE_32BIT  2
> > +#define IOREGIONFD_SIZE_64BIT  3
> > +
> > +#define IOREGIONFD_SIZE_OFFSET 4
> > +#define IOREGIONFD_RESP_OFFSET 6
> > +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> > +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)
> 
> Instead of using macros, why not explicitly define them in struct 
> ioregionfd_cmd instead of using info?
> 
> 
> > +
> > +#endif
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > index 48ff92bca966..da38124e1418 100644
> > --- a/virt/kvm/ioregion.c
> > +++ b/virt/kvm/ioregion.c
> > @@ -3,6 +3,7 @@
> >   #include <linux/fs.h>
> >   #include <kvm/iodev.h>
> >   #include "eventfd.h"
> > +#include <uapi/linux/ioregion.h>
> >   
> >   void
> >   kvm_ioregionfd_init(struct kvm *kvm)
> > @@ -38,18 +39,190 @@ ioregion_release(struct ioregion *p)
> >   	kfree(p);
> >   }
> >   
> > +static bool
> > +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt,
> > int resp,
> > +	 u64 user_data, const void *val)
> > +{
> > +	u64 size = 0;
> > +
> > +	switch (len) {
> > +	case 1:
> > +		size = IOREGIONFD_SIZE_8BIT;
> > +		break;
> > +	case 2:
> > +		size = IOREGIONFD_SIZE_16BIT;
> > +		break;
> > +	case 4:
> > +		size = IOREGIONFD_SIZE_32BIT;
> > +		break;
> > +	case 8:
> > +		size = IOREGIONFD_SIZE_64BIT;
> > +		break;
> > +	default:
> > +		return false;
> > +	}
> > +
> > +	if (val)
> > +		memcpy(&cmd->data, val, len);
> > +	cmd->user_data = user_data;
> > +	cmd->offset = offset;
> > +	cmd->info |= opt;
> > +	cmd->info |= IOREGIONFD_SIZE(size);
> > +	cmd->info |= IOREGIONFD_RESP(resp);
> > +
> > +	return true;
> > +}
> > +
> > +enum {
> > +	SEND_CMD,
> > +	GET_REPLY,
> > +	COMPLETE
> > +};
> > +
> > +static void
> > +ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device
> > *this,
> > +		  bool in, gpa_t addr, int len, u64 data, u8 state,
> > void *val)
> > +{
> > +	vcpu->ioregion_interrupted = true;
> > +
> > +	vcpu->ioregion_ctx.dev = this;
> > +	vcpu->ioregion_ctx.val = val;
> > +	vcpu->ioregion_ctx.state = state;
> > +	vcpu->ioregion_ctx.addr = addr;
> > +	vcpu->ioregion_ctx.len = len;
> > +	vcpu->ioregion_ctx.data = data;
> > +	vcpu->ioregion_ctx.in = in;
> > +}
> > +
> >   static int
> >   ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> >   	      int len, void *val)
> >   {
> > -	return -EOPNOTSUPP;
> > +	struct ioregion *p = to_ioregion(this);
> > +	union {
> > +		struct ioregionfd_cmd cmd;
> > +		struct ioregionfd_resp resp;
> > +	} buf;
> > +	int ret = 0;
> > +	int state = 0;
> 
> Let's use SEND_CMD otherwise it would be hard for the reviewers...
> 
Ok

> 
> > +
> > +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> > +		return -EINVAL;
> > +
> > +	if (unlikely(vcpu->ioregion_interrupted)) {
> > +		vcpu->ioregion_interrupted = false;
> > +
> > +		switch (vcpu->ioregion_ctx.state) {
> > +		case SEND_CMD:
> > +			goto send_cmd;
> > +		case GET_REPLY:
> > +			goto get_repl;
> > +		case COMPLETE:
> 
> I fail to understand under what condition we can reach this point.
> 
I was thinking about the case when a signal is received after obtaining
a reply. But it seems it’s unnecessary to consider this.

> 
> > +			memcpy(val, &vcpu->ioregion_ctx.data, len);
> > +			return 0;
> > +		}
> > +	}
> > +
> > +send_cmd:
> > +	memset(&buf, 0, sizeof(buf));
> > +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len,
> > IOREGIONFD_CMD_READ,
> > +		      1, p->user_data, NULL))
> > +		return -EOPNOTSUPP;
> > +
> > +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> > +	state = (ret == sizeof(buf.cmd));
> > +	if (signal_pending(current)) {
> > +		ioregion_save_ctx(vcpu, this, 1, addr, len, 0, state,
> > val);
> > +		return -EINTR;
> > +	}
> > +	if (ret != sizeof(buf.cmd)) {
> > +		ret = (ret < 0) ? ret : -EIO;
> > +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ?
> > -EINVAL : ret;
> > +	}
> > +
> > +get_repl:
> > +	memset(&buf, 0, sizeof(buf));
> > +	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> > +	state += (ret == sizeof(buf.resp));
> 
> Let's use enum instead of doing tricks like this.
> 
> Thanks
> 
> 
> > +	if (signal_pending(current)) {
> > +		ioregion_save_ctx(vcpu, this, 1, addr, len,
> > buf.resp.data, state, val);
> > +		return -EINTR;
> > +	}
> > +	if (ret != sizeof(buf.resp)) {
> > +		ret = (ret < 0) ? ret : -EIO;
> > +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ?
> > -EINVAL : ret;
> > +	}
> > +
> > +	memcpy(val, &buf.resp.data, len);
> > +
> > +	return 0;
> >   }
> >   
> >   static int
> >   ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> >   		int len, const void *val)
> >   {
> > -	return -EOPNOTSUPP;
> > +	struct ioregion *p = to_ioregion(this);
> > +	union {
> > +		struct ioregionfd_cmd cmd;
> > +		struct ioregionfd_resp resp;
> > +	} buf;
> > +	int ret = 0;
> > +	int state = 0;
> > +
> > +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> > +		return -EINVAL;
> > +
> > +	if (unlikely(vcpu->ioregion_interrupted)) {
> > +		vcpu->ioregion_interrupted = false;
> > +
> > +		switch (vcpu->ioregion_ctx.state) {
> > +		case SEND_CMD:
> > +			goto send_cmd;
> > +		case GET_REPLY:
> > +			if (!p->posted_writes)
> > +				goto get_repl;
> > +			fallthrough;
> > +		case COMPLETE:
> > +			return 0;
> > +		}
> > +	}
> > +
> > +send_cmd:
> > +	memset(&buf, 0, sizeof(buf));
> > +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len,
> > IOREGIONFD_CMD_WRITE,
> > +		      p->posted_writes ? 0 : 1, p->user_data, val))
> > +		return -EOPNOTSUPP;
> > +
> > +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> > +	state = (ret == sizeof(buf.cmd));
> > +	if (signal_pending(current)) {
> > +		ioregion_save_ctx(vcpu, this, 0, addr, len,
> > +				  0, state, (void *)val);
> > +		return -EINTR;
> > +	}
> > +	if (ret != sizeof(buf.cmd)) {
> > +		ret = (ret < 0) ? ret : -EIO;
> > +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ?
> > -EINVAL : ret;
> > +	}
> > +
> > +get_repl:
> > +	if (!p->posted_writes) {
> > +		memset(&buf, 0, sizeof(buf));
> > +		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp),
> > 0);
> > +		state += (ret == sizeof(buf.resp));
> > +		if (signal_pending(current)) {
> > +			ioregion_save_ctx(vcpu, this, 0, addr, len,
> > +					  0, state, (void *)val);
> > +			return -EINTR;
> > +		}
> > +		if (ret != sizeof(buf.resp)) {
> > +			ret = (ret < 0) ? ret : -EIO;
> > +			return (ret == -EAGAIN || ret == -EWOULDBLOCK)
> > ? -EINVAL : ret;
> > +		}
> > +	}
> > +
> > +	return 0;
> >   }
> >   
> >   /*
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 88b92fc3da51..df387857f51f 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >   			      struct kvm_io_range *range, const void
> > *val)
> >   {
> >   	int idx;
> > +	int ret = 0;
> >   
> >   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
> >   	if (idx < 0)
> > @@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct
> > kvm_vcpu *vcpu, struct kvm_io_bus *bus,
> >   
> >   	while (idx < bus->dev_count &&
> >   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> > -		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev,
> > range->addr,
> > -					range->len, val))
> > +		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev,
> > range->addr,
> > +					 range->len, val);
> > +		if (!ret)
> >   			return idx;
> > +		if (ret < 0 && ret != -EOPNOTSUPP)
> > +			return ret;
> >   		idx++;
> >   	}
> >   
> > @@ -4264,6 +4268,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >   			     struct kvm_io_range *range, void *val)
> >   {
> >   	int idx;
> > +	int ret = 0;
> >   
> >   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
> >   	if (idx < 0)
> > @@ -4271,9 +4276,12 @@ static int __kvm_io_bus_read(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >   
> >   	while (idx < bus->dev_count &&
> >   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> > -		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev,
> > range->addr,
> > -				       range->len, val))
> > +		ret = kvm_iodevice_read(vcpu, bus->range[idx].dev,
> > range->addr,
> > +					range->len, val);
> > +		if (!ret)
> >   			return idx;
> > +		if (ret < 0 && ret != -EOPNOTSUPP)
> > +			return ret;
> >   		idx++;
> >   	}
> >   


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-08  6:21   ` Jason Wang
  2021-02-09 14:59     ` Stefan Hajnoczi
@ 2021-02-10 19:31     ` Elena Afanasova
  2021-02-11 14:59       ` Stefan Hajnoczi
  2021-02-18  6:20       ` Jason Wang
  1 sibling, 2 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-02-10 19:31 UTC (permalink / raw)
  To: Jason Wang, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva

On Mon, 2021-02-08 at 14:21 +0800, Jason Wang wrote:
> On 2021/1/30 上午2:48, Elena Afanasova wrote:
> > This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> > read and write accesses are dispatched through the given ioregionfd
> > instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> > setting fds to -1.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> > Changes in v2:
> >    - changes after code review
> > 
> >   arch/x86/kvm/Kconfig     |   1 +
> >   arch/x86/kvm/Makefile    |   1 +
> >   arch/x86/kvm/x86.c       |   1 +
> >   include/linux/kvm_host.h |  17 +++
> >   include/uapi/linux/kvm.h |  23 ++++
> >   virt/kvm/Kconfig         |   3 +
> >   virt/kvm/eventfd.c       |  25 +++++
> >   virt/kvm/eventfd.h       |  14 +++
> >   virt/kvm/ioregion.c      | 232
> > +++++++++++++++++++++++++++++++++++++++
> >   virt/kvm/ioregion.h      |  15 +++
> >   virt/kvm/kvm_main.c      |  11 ++
> >   11 files changed, 343 insertions(+)
> >   create mode 100644 virt/kvm/eventfd.h
> >   create mode 100644 virt/kvm/ioregion.c
> >   create mode 100644 virt/kvm/ioregion.h
> > 
> > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > index f92dfd8ef10d..b914ef375199 100644
> > --- a/arch/x86/kvm/Kconfig
> > +++ b/arch/x86/kvm/Kconfig
> > @@ -33,6 +33,7 @@ config KVM
> >   	select HAVE_KVM_IRQ_BYPASS
> >   	select HAVE_KVM_IRQ_ROUTING
> >   	select HAVE_KVM_EVENTFD
> > +	select KVM_IOREGION
> >   	select KVM_ASYNC_PF
> >   	select USER_RETURN_NOTIFIER
> >   	select KVM_MMIO
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index b804444e16d4..b3b17dc9f7d4 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> >   kvm-y			+= $(KVM)/kvm_main.o
> > $(KVM)/coalesced_mmio.o \
> >   				$(KVM)/eventfd.o $(KVM)/irqchip.o
> > $(KVM)/vfio.o
> >   kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> >   
> >   kvm-y			+= x86.o emulate.o i8259.o irq.o
> > lapic.o \
> >   			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
> > mtrr.o \
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index e545a8a613b1..ddb28f5ca252 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
> > *kvm, long ext)
> >   	case KVM_CAP_X86_USER_SPACE_MSR:
> >   	case KVM_CAP_X86_MSR_FILTER:
> >   	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > +	case KVM_CAP_IOREGIONFD:
> >   		r = 1;
> >   		break;
> >   	case KVM_CAP_SYNC_REGS:
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7f2e2a09ebbd..7cd667dddba9 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -470,6 +470,10 @@ struct kvm {
> >   		struct mutex      resampler_lock;
> >   	} irqfds;
> >   	struct list_head ioeventfds;
> > +#endif
> > +#ifdef CONFIG_KVM_IOREGION
> > +	struct list_head ioregions_mmio;
> > +	struct list_head ioregions_pio;
> >   #endif
> >   	struct kvm_vm_stat stat;
> >   	struct kvm_arch arch;
> > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm
> > *kvm, struct kvm_ioeventfd *args)
> >   
> >   #endif /* CONFIG_HAVE_KVM_EVENTFD */
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +void kvm_ioregionfd_init(struct kvm *kvm);
> > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> > +
> > +#else
> > +
> > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > kvm_ioregion *args)
> > +{
> > +	return -ENOSYS;
> > +}
> > +#endif
> > +
> >   void kvm_arch_irq_routing_update(struct kvm *kvm);
> >   
> >   static inline void kvm_make_request(int req, struct kvm_vcpu
> > *vcpu)
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index ca41220b40b8..81e775778c66 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> >   	__u8  pad[36];
> >   };
> >   
> > +enum {
> > +	kvm_ioregion_flag_nr_pio,
> > +	kvm_ioregion_flag_nr_posted_writes,
> > +	kvm_ioregion_flag_nr_max,
> > +};
> > +
> > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > kvm_ioregion_flag_nr_posted_writes)
> > +
> > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > kvm_ioregion_flag_nr_max) - 1)
> > +
> > +struct kvm_ioregion {
> > +	__u64 guest_paddr; /* guest physical address */
> > +	__u64 memory_size; /* bytes */
> 
> Do we really need __u64 here?
> 
> 
> > +	__u64 user_data;
> > +	__s32 rfd;
> > +	__s32 wfd;
> > +	__u32 flags;
> > +	__u8  pad[28];
> > +};
> > +
> >   #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> >   #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> >   #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> >   #define KVM_CAP_X86_USER_SPACE_MSR 188
> >   #define KVM_CAP_X86_MSR_FILTER 189
> >   #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > +#define KVM_CAP_IOREGIONFD 191
> >   
> >   #ifdef KVM_CAP_IRQ_ROUTING
> >   
> > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> >   					struct
> > kvm_userspace_memory_region)
> >   #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> >   #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > kvm_ioregion)
> >   
> >   /* enable ucontrol for s390 */
> >   struct kvm_s390_ucas_mapping {
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 1c37ccd5d402..5e6620bbf000 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
> >          bool
> >          select EVENTFD
> >   
> > +config KVM_IOREGION
> > +       bool
> > +
> >   config KVM_MMIO
> >          bool
> >   
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index c2323c27a28b..aadb73903f8b 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -27,6 +27,7 @@
> >   #include <trace/events/kvm.h>
> >   
> >   #include <kvm/iodev.h>
> > +#include "ioregion.h"
> >   
> >   #ifdef CONFIG_HAVE_KVM_IRQFD
> >   
> > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > ioeventfd_ops = {
> >   	.destructor = ioeventfd_destructor,
> >   };
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +/* assumes kvm->slots_lock held */
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > +			  u64 start, u64 size)
> > +{
> > +	struct _ioeventfd *_p;
> > +
> > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > +		if (_p->bus_idx == bus_idx &&
> > +		    overlap(start, size, _p->addr,
> > +			    !_p->length ? 8 : _p->length))
> > +			return true;
> > +
> > +	return false;
> > +}
> > +#endif
> > +
> >   /* assumes kvm->slots_lock held */
> >   static bool
> >   ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > struct _ioeventfd *p)
> >   		       _p->datamatch == p->datamatch))))
> >   			return true;
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> > +					  !p->length ? 8 : p->length))
> > +			return true;
> > +#endif
> > +
> >   	return false;
> >   }
> >   
> > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> > new file mode 100644
> > index 000000000000..73a621eebae3
> > --- /dev/null
> > +++ b/virt/kvm/eventfd.h
> > @@ -0,0 +1,14 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __KVM_EVENTFD_H__
> > +#define __KVM_EVENTFD_H__
> > +
> > +#ifdef CONFIG_KVM_IOREGION
> > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
> > u64 size);
> > +#else
> > +static inline bool
> > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > size)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +#endif
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > new file mode 100644
> > index 000000000000..48ff92bca966
> > --- /dev/null
> > +++ b/virt/kvm/ioregion.c
> > @@ -0,0 +1,232 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/kvm_host.h>
> > +#include <linux/fs.h>
> > +#include <kvm/iodev.h>
> > +#include "eventfd.h"
> > +
> > +void
> > +kvm_ioregionfd_init(struct kvm *kvm)
> > +{
> > +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> > +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> > +}
> > +
> > +struct ioregion {
> > +	struct list_head     list;
> > +	u64                  paddr;  /* guest physical address */
> > +	u64                  size;   /* size in bytes */
> > +	struct file         *rf;
> > +	struct file         *wf;
> > +	u64                  user_data; /* opaque token used by
> > userspace */
> > +	struct kvm_io_device dev;
> > +	bool                 posted_writes;
> > +};
> > +
> > +static inline struct ioregion *
> > +to_ioregion(struct kvm_io_device *dev)
> > +{
> > +	return container_of(dev, struct ioregion, dev);
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +static void
> > +ioregion_release(struct ioregion *p)
> > +{
> > +	fput(p->rf);
> > +	fput(p->wf);
> > +	list_del(&p->list);
> > +	kfree(p);
> > +}
> > +
> > +static int
> > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> > +	      int len, void *val)
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> > +
> > +static int
> > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > gpa_t addr,
> > +		int len, const void *val)
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> > +
> > +/*
> > + * This function is called as KVM is completely shutting down.  We
> > do not
> > + * need to worry about locking just nuke anything we have as
> > quickly as possible
> > + */
> > +static void
> > +ioregion_destructor(struct kvm_io_device *this)
> > +{
> > +	struct ioregion *p = to_ioregion(this);
> > +
> > +	ioregion_release(p);
> > +}
> > +
> > +static const struct kvm_io_device_ops ioregion_ops = {
> > +	.read       = ioregion_read,
> > +	.write      = ioregion_write,
> > +	.destructor = ioregion_destructor,
> > +};
> > +
> > +static inline struct list_head *
> > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> > +{
> > +	return (bus_idx == KVM_MMIO_BUS) ?
> > +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> > +}
> > +
> > +/* check for not overlapping case and reverse */
> > +inline bool
> > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > +{
> > +	u64 end1 = start1 + size1 - 1;
> > +	u64 end2 = start2 + size2 - 1;
> > +
> > +	return !(end1 < start2 || start1 >= end2);
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +bool
> > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> > +		      u64 start, u64 size)
> > +{
> > +	struct ioregion *_p;
> > +	struct list_head *ioregions;
> > +
> > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > +	list_for_each_entry(_p, ioregions, list)
> > +		if (overlap(start, size, _p->paddr, _p->size))
> > +			return true;
> > +
> > +	return false;
> > +}
> > +
> > +/* assumes kvm->slots_lock held */
> > +static bool
> > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
> > kvm_bus bus_idx)
> > +{
> > +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> > +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +static enum kvm_bus
> > +get_bus_from_flags(__u32 flags)
> > +{
> > +	if (flags & KVM_IOREGION_PIO)
> > +		return KVM_PIO_BUS;
> > +	return KVM_MMIO_BUS;
> > +}
> > +
> > +int
> > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	struct ioregion *p;
> > +	struct file *rfile, *wfile;
> > +	enum kvm_bus bus_idx;
> > +	int ret = 0;
> > +
> > +	if (!args->memory_size)
> > +		return -EINVAL;
> > +	if ((args->guest_paddr + args->memory_size - 1) < args-
> > >guest_paddr)
> > +		return -EINVAL;
> > +
> > +	rfile = fget(args->rfd);
> > +	if (!rfile)
> > +		return -EBADF;
> 
> So the question still stands: if we want to use ioregionfd for a
> doorbell, we don't need rfd in this case?
> 
Using ioregionfd for doorbells seems to be an open question. Probably
the series could just focus on the non-doorbell cases.

> 
> > +	wfile = fget(args->wfd);
> > +	if (!wfile) {
> > +		fput(rfile);
> > +		return -EBADF;
> > +	}
> > +	if ((rfile->f_flags & O_NONBLOCK) || (wfile->f_flags &
> > O_NONBLOCK)) {
> > +		ret = -EINVAL;
> > +		goto fail;
> > +	}
> 
> I wonder how much value there is in adding a check like this here (if
> our code can gracefully deal with a blocking fd).
> 
Do you think it would be better to remove this check and just mention
that in a comment or documentation?

> 
> > +	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> > +	if (!p) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> > +	}
> > +
> > +	INIT_LIST_HEAD(&p->list);
> > +	p->paddr = args->guest_paddr;
> > +	p->size = args->memory_size;
> > +	p->user_data = args->user_data;
> > +	p->rf = rfile;
> > +	p->wf = wfile;
> > +	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
> > +	bus_idx = get_bus_from_flags(args->flags);
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	if (ioregion_collision(kvm, p, bus_idx)) {
> > +		ret = -EEXIST;
> > +		goto unlock_fail;
> > +	}
> > +	kvm_iodevice_init(&p->dev, &ioregion_ops);
> > +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
> > +				      &p->dev);
> 
> I think we agreed on the previous version that we need to deal with
> the FAST_MMIO bus here?
> 
Yes, I’ll include FAST_MMIO support in an RFC v3 series.

> 
> > +	if (ret < 0)
> > +		goto unlock_fail;
> > +	list_add_tail(&p->list, get_ioregion_list(kvm, bus_idx));
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return 0;
> > +
> > +unlock_fail:
> > +	mutex_unlock(&kvm->slots_lock);
> > +	kfree(p);
> > +fail:
> > +	fput(rfile);
> > +	fput(wfile);
> > +
> > +	return ret;
> > +}
> > +
> > +static int
> > +kvm_rm_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	struct ioregion         *p, *tmp;
> > +	enum kvm_bus             bus_idx;
> > +	int                      ret = -ENOENT;
> > +	struct list_head        *ioregions;
> > +
> > +	if (args->rfd != -1 || args->wfd != -1)
> > +		return -EINVAL;
> > +
> > +	bus_idx = get_bus_from_flags(args->flags);
> > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	list_for_each_entry_safe(p, tmp, ioregions, list) {
> > +		if (p->paddr == args->guest_paddr  &&
> > +		    p->size == args->memory_size) {
> > +			kvm_io_bus_unregister_dev(kvm, bus_idx, &p-
> > >dev);
> > +			ioregion_release(p);
> > +			ret = 0;
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return ret;
> > +}
> > +
> > +int
> > +kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args)
> > +{
> > +	if (args->flags & ~KVM_IOREGION_VALID_FLAG_MASK)
> > +		return -EINVAL;
> > +	if (args->rfd == -1 || args->wfd == -1)
> > +		return kvm_rm_ioregion(kvm, args);
> > +
> > +	return kvm_set_ioregion(kvm, args);
> > +}
> > diff --git a/virt/kvm/ioregion.h b/virt/kvm/ioregion.h
> > new file mode 100644
> > index 000000000000..23ffa812ec7a
> > --- /dev/null
> > +++ b/virt/kvm/ioregion.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __KVM_IOREGION_H__
> > +#define __KVM_IOREGION_H__
> > +
> > +#ifdef CONFIG_KVM_IOREGION
> > +inline bool overlap(u64 start1, u64 size1, u64 start2, u64 size2);
> > +bool kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64
> > start, u64 size);
> > +#else
> > +static inline bool
> > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > size)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +#endif
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 2541a17ff1c4..88b92fc3da51 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -747,6 +747,7 @@ static struct kvm *kvm_create_vm(unsigned long
> > type)
> >   	mmgrab(current->mm);
> >   	kvm->mm = current->mm;
> >   	kvm_eventfd_init(kvm);
> > +	kvm_ioregionfd_init(kvm);
> >   	mutex_init(&kvm->lock);
> >   	mutex_init(&kvm->irq_lock);
> >   	mutex_init(&kvm->slots_lock);
> > @@ -3708,6 +3709,16 @@ static long kvm_vm_ioctl(struct file *filp,
> >   		r = kvm_vm_ioctl_set_memory_region(kvm,
> > &kvm_userspace_mem);
> >   		break;
> >   	}
> > +	case KVM_SET_IOREGION: {
> > +		struct kvm_ioregion data;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&data, argp, sizeof(data)))
> > +			goto out;
> > +
> > +		r = kvm_ioregionfd(kvm, &data);
> > +		break;
> > +	}
> >   	case KVM_GET_DIRTY_LOG: {
> >   		struct kvm_dirty_log log;
> >   


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-10 19:31     ` Elena Afanasova
@ 2021-02-11 14:59       ` Stefan Hajnoczi
  2021-02-17 23:05         ` Elena Afanasova
  2021-02-18  6:22         ` Jason Wang
  2021-02-18  6:20       ` Jason Wang
  1 sibling, 2 replies; 28+ messages in thread
From: Stefan Hajnoczi @ 2021-02-11 14:59 UTC (permalink / raw)
  To: Elena Afanasova; +Cc: Jason Wang, kvm, jag.raman, elena.ufimtseva

[-- Attachment #1: Type: text/plain, Size: 14702 bytes --]

On Wed, Feb 10, 2021 at 11:31:30AM -0800, Elena Afanasova wrote:
> On Mon, 2021-02-08 at 14:21 +0800, Jason Wang wrote:
> > On 2021/1/30 上午2:48, Elena Afanasova wrote:
> > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
> > > read and write accesses are dispatched through the given ioregionfd
> > > instead of returning from ioctl(KVM_RUN). Regions can be deleted by
> > > setting fds to -1.
> > > 
> > > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > > ---
> > > Changes in v2:
> > >    - changes after code review
> > > 
> > >   arch/x86/kvm/Kconfig     |   1 +
> > >   arch/x86/kvm/Makefile    |   1 +
> > >   arch/x86/kvm/x86.c       |   1 +
> > >   include/linux/kvm_host.h |  17 +++
> > >   include/uapi/linux/kvm.h |  23 ++++
> > >   virt/kvm/Kconfig         |   3 +
> > >   virt/kvm/eventfd.c       |  25 +++++
> > >   virt/kvm/eventfd.h       |  14 +++
> > >   virt/kvm/ioregion.c      | 232
> > > +++++++++++++++++++++++++++++++++++++++
> > >   virt/kvm/ioregion.h      |  15 +++
> > >   virt/kvm/kvm_main.c      |  11 ++
> > >   11 files changed, 343 insertions(+)
> > >   create mode 100644 virt/kvm/eventfd.h
> > >   create mode 100644 virt/kvm/ioregion.c
> > >   create mode 100644 virt/kvm/ioregion.h
> > > 
> > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > > index f92dfd8ef10d..b914ef375199 100644
> > > --- a/arch/x86/kvm/Kconfig
> > > +++ b/arch/x86/kvm/Kconfig
> > > @@ -33,6 +33,7 @@ config KVM
> > >   	select HAVE_KVM_IRQ_BYPASS
> > >   	select HAVE_KVM_IRQ_ROUTING
> > >   	select HAVE_KVM_EVENTFD
> > > +	select KVM_IOREGION
> > >   	select KVM_ASYNC_PF
> > >   	select USER_RETURN_NOTIFIER
> > >   	select KVM_MMIO
> > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > index b804444e16d4..b3b17dc9f7d4 100644
> > > --- a/arch/x86/kvm/Makefile
> > > +++ b/arch/x86/kvm/Makefile
> > > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> > >   kvm-y			+= $(KVM)/kvm_main.o
> > > $(KVM)/coalesced_mmio.o \
> > >   				$(KVM)/eventfd.o $(KVM)/irqchip.o
> > > $(KVM)/vfio.o
> > >   kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> > >   
> > >   kvm-y			+= x86.o emulate.o i8259.o irq.o
> > > lapic.o \
> > >   			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
> > > mtrr.o \
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index e545a8a613b1..ddb28f5ca252 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
> > > *kvm, long ext)
> > >   	case KVM_CAP_X86_USER_SPACE_MSR:
> > >   	case KVM_CAP_X86_MSR_FILTER:
> > >   	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > > +	case KVM_CAP_IOREGIONFD:
> > >   		r = 1;
> > >   		break;
> > >   	case KVM_CAP_SYNC_REGS:
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 7f2e2a09ebbd..7cd667dddba9 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -470,6 +470,10 @@ struct kvm {
> > >   		struct mutex      resampler_lock;
> > >   	} irqfds;
> > >   	struct list_head ioeventfds;
> > > +#endif
> > > +#ifdef CONFIG_KVM_IOREGION
> > > +	struct list_head ioregions_mmio;
> > > +	struct list_head ioregions_pio;
> > >   #endif
> > >   	struct kvm_vm_stat stat;
> > >   	struct kvm_arch arch;
> > > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm
> > > *kvm, struct kvm_ioeventfd *args)
> > >   
> > >   #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > >   
> > > +#ifdef CONFIG_KVM_IOREGION
> > > +void kvm_ioregionfd_init(struct kvm *kvm);
> > > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
> > > +
> > > +#else
> > > +
> > > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > > kvm_ioregion *args)
> > > +{
> > > +	return -ENOSYS;
> > > +}
> > > +#endif
> > > +
> > >   void kvm_arch_irq_routing_update(struct kvm *kvm);
> > >   
> > >   static inline void kvm_make_request(int req, struct kvm_vcpu
> > > *vcpu)
> > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > index ca41220b40b8..81e775778c66 100644
> > > --- a/include/uapi/linux/kvm.h
> > > +++ b/include/uapi/linux/kvm.h
> > > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> > >   	__u8  pad[36];
> > >   };
> > >   
> > > +enum {
> > > +	kvm_ioregion_flag_nr_pio,
> > > +	kvm_ioregion_flag_nr_posted_writes,
> > > +	kvm_ioregion_flag_nr_max,
> > > +};
> > > +
> > > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > > kvm_ioregion_flag_nr_posted_writes)
> > > +
> > > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > > kvm_ioregion_flag_nr_max) - 1)
> > > +
> > > +struct kvm_ioregion {
> > > +	__u64 guest_paddr; /* guest physical address */
> > > +	__u64 memory_size; /* bytes */
> > 
> > Do we really need __u64 here?
> > 
> > 
> > > +	__u64 user_data;
> > > +	__s32 rfd;
> > > +	__s32 wfd;
> > > +	__u32 flags;
> > > +	__u8  pad[28];
> > > +};
> > > +
> > >   #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> > >   #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> > >   #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> > >   #define KVM_CAP_X86_USER_SPACE_MSR 188
> > >   #define KVM_CAP_X86_MSR_FILTER 189
> > >   #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > > +#define KVM_CAP_IOREGIONFD 191
> > >   
> > >   #ifdef KVM_CAP_IRQ_ROUTING
> > >   
> > > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> > >   					struct
> > > kvm_userspace_memory_region)
> > >   #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> > >   #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > > kvm_ioregion)
> > >   
> > >   /* enable ucontrol for s390 */
> > >   struct kvm_s390_ucas_mapping {
> > > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > > index 1c37ccd5d402..5e6620bbf000 100644
> > > --- a/virt/kvm/Kconfig
> > > +++ b/virt/kvm/Kconfig
> > > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
> > >          bool
> > >          select EVENTFD
> > >   
> > > +config KVM_IOREGION
> > > +       bool
> > > +
> > >   config KVM_MMIO
> > >          bool
> > >   
> > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > index c2323c27a28b..aadb73903f8b 100644
> > > --- a/virt/kvm/eventfd.c
> > > +++ b/virt/kvm/eventfd.c
> > > @@ -27,6 +27,7 @@
> > >   #include <trace/events/kvm.h>
> > >   
> > >   #include <kvm/iodev.h>
> > > +#include "ioregion.h"
> > >   
> > >   #ifdef CONFIG_HAVE_KVM_IRQFD
> > >   
> > > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > > ioeventfd_ops = {
> > >   	.destructor = ioeventfd_destructor,
> > >   };
> > >   
> > > +#ifdef CONFIG_KVM_IOREGION
> > > +/* assumes kvm->slots_lock held */
> > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > > +			  u64 start, u64 size)
> > > +{
> > > +	struct _ioeventfd *_p;
> > > +
> > > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > > +		if (_p->bus_idx == bus_idx &&
> > > +		    overlap(start, size, _p->addr,
> > > +			    !_p->length ? 8 : _p->length))
> > > +			return true;
> > > +
> > > +	return false;
> > > +}
> > > +#endif
> > > +
> > >   /* assumes kvm->slots_lock held */
> > >   static bool
> > >   ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> > > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > > struct _ioeventfd *p)
> > >   		       _p->datamatch == p->datamatch))))
> > >   			return true;
> > >   
> > > +#ifdef CONFIG_KVM_IOREGION
> > > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
> > > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
> > > +					  !p->length ? 8 : p->length))
> > > +			return true;
> > > +#endif
> > > +
> > >   	return false;
> > >   }
> > >   
> > > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> > > new file mode 100644
> > > index 000000000000..73a621eebae3
> > > --- /dev/null
> > > +++ b/virt/kvm/eventfd.h
> > > @@ -0,0 +1,14 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > +#ifndef __KVM_EVENTFD_H__
> > > +#define __KVM_EVENTFD_H__
> > > +
> > > +#ifdef CONFIG_KVM_IOREGION
> > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
> > > u64 size);
> > > +#else
> > > +static inline bool
> > > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64
> > > size)
> > > +{
> > > +	return false;
> > > +}
> > > +#endif
> > > +#endif
> > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > > new file mode 100644
> > > index 000000000000..48ff92bca966
> > > --- /dev/null
> > > +++ b/virt/kvm/ioregion.c
> > > @@ -0,0 +1,232 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +#include <linux/kvm_host.h>
> > > +#include <linux/fs.h>
> > > +#include <kvm/iodev.h>
> > > +#include "eventfd.h"
> > > +
> > > +void
> > > +kvm_ioregionfd_init(struct kvm *kvm)
> > > +{
> > > +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> > > +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> > > +}
> > > +
> > > +struct ioregion {
> > > +	struct list_head     list;
> > > +	u64                  paddr;  /* guest physical address */
> > > +	u64                  size;   /* size in bytes */
> > > +	struct file         *rf;
> > > +	struct file         *wf;
> > > +	u64                  user_data; /* opaque token used by
> > > userspace */
> > > +	struct kvm_io_device dev;
> > > +	bool                 posted_writes;
> > > +};
> > > +
> > > +static inline struct ioregion *
> > > +to_ioregion(struct kvm_io_device *dev)
> > > +{
> > > +	return container_of(dev, struct ioregion, dev);
> > > +}
> > > +
> > > +/* assumes kvm->slots_lock held */
> > > +static void
> > > +ioregion_release(struct ioregion *p)
> > > +{
> > > +	fput(p->rf);
> > > +	fput(p->wf);
> > > +	list_del(&p->list);
> > > +	kfree(p);
> > > +}
> > > +
> > > +static int
> > > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > > gpa_t addr,
> > > +	      int len, void *val)
> > > +{
> > > +	return -EOPNOTSUPP;
> > > +}
> > > +
> > > +static int
> > > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > > gpa_t addr,
> > > +		int len, const void *val)
> > > +{
> > > +	return -EOPNOTSUPP;
> > > +}
> > > +
> > > +/*
> > > + * This function is called as KVM is completely shutting down.  We
> > > do not
> > > + * need to worry about locking just nuke anything we have as
> > > quickly as possible
> > > + */
> > > +static void
> > > +ioregion_destructor(struct kvm_io_device *this)
> > > +{
> > > +	struct ioregion *p = to_ioregion(this);
> > > +
> > > +	ioregion_release(p);
> > > +}
> > > +
> > > +static const struct kvm_io_device_ops ioregion_ops = {
> > > +	.read       = ioregion_read,
> > > +	.write      = ioregion_write,
> > > +	.destructor = ioregion_destructor,
> > > +};
> > > +
> > > +static inline struct list_head *
> > > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> > > +{
> > > +	return (bus_idx == KVM_MMIO_BUS) ?
> > > +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> > > +}
> > > +
> > > +/* check for not overlapping case and reverse */
> > > +inline bool
> > > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > > +{
> > > +	u64 end1 = start1 + size1 - 1;
> > > +	u64 end2 = start2 + size2 - 1;
> > > +
> > > +	return !(end1 < start2 || start1 >= end2);
> > > +}
> > > +
> > > +/* assumes kvm->slots_lock held */
> > > +bool
> > > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> > > +		      u64 start, u64 size)
> > > +{
> > > +	struct ioregion *_p;
> > > +	struct list_head *ioregions;
> > > +
> > > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > > +	list_for_each_entry(_p, ioregions, list)
> > > +		if (overlap(start, size, _p->paddr, _p->size))
> > > +			return true;
> > > +
> > > +	return false;
> > > +}
> > > +
> > > +/* assumes kvm->slots_lock held */
> > > +static bool
> > > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
> > > kvm_bus bus_idx)
> > > +{
> > > +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
> > > +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
> > > +		return true;
> > > +
> > > +	return false;
> > > +}
> > > +
> > > +static enum kvm_bus
> > > +get_bus_from_flags(__u32 flags)
> > > +{
> > > +	if (flags & KVM_IOREGION_PIO)
> > > +		return KVM_PIO_BUS;
> > > +	return KVM_MMIO_BUS;
> > > +}
> > > +
> > > +int
> > > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > > +{
> > > +	struct ioregion *p;
> > > +	struct file *rfile, *wfile;
> > > +	enum kvm_bus bus_idx;
> > > +	int ret = 0;
> > > +
> > > +	if (!args->memory_size)
> > > +		return -EINVAL;
> > > +	if ((args->guest_paddr + args->memory_size - 1) < args-
> > > >guest_paddr)
> > > +		return -EINVAL;
> > > +
> > > +	rfile = fget(args->rfd);
> > > +	if (!rfile)
> > > +		return -EBADF;
> > 
> > So the question still, if we want to use ioregion fd for doorbell,
> > we 
> > don't need rfd in this case?
> > 
> Using ioregionfd for doorbell seems to be an open question. Probably it
> could just focus on the non-doorbell cases.

Below you replied FAST_MMIO will be in v3. That is the doorbell case, so
maybe it is in scope for this patch series?

I think continuing to use ioeventfd for most doorbell registers makes
sense.

However, there are two cases where ioregionfd doorbell support is
interesting:

1. The (non-FAST_MMIO) case where the application needs to know the
   value written to the doorbell. ioeventfd cannot do this (datamatch
   can handle a subset of cases but not all) so we need ioregionfd for
   this.

2. The FAST_MMIO case just for convenience if applications prefer to use
   a single API (ioregionfd) instead of implementing both ioregionfd and
   ioeventfd.

ioeventfd will still have its benefits (and limitations) that make it
different from ioregionfd. In particular, ioregionfd will not merge
doorbell writes into a single message because doing so would basically
involve reimplementing ioeventfd functionality as part of ioregionfd and
isn't compatible with the current approach where userspace can provide
any file descriptor for communication.

Elena and Jason: do you agree with this API design?

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-11 14:59       ` Stefan Hajnoczi
@ 2021-02-17 23:05         ` Elena Afanasova
  2021-02-18  6:22         ` Jason Wang
  1 sibling, 0 replies; 28+ messages in thread
From: Elena Afanasova @ 2021-02-17 23:05 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Jason Wang, kvm, jag.raman, elena.ufimtseva

On Thu, 2021-02-11 at 14:59 +0000, Stefan Hajnoczi wrote:
> On Wed, Feb 10, 2021 at 11:31:30AM -0800, Elena Afanasova wrote:
> > On Mon, 2021-02-08 at 14:21 +0800, Jason Wang wrote:
> > > On 2021/1/30 上午2:48, Elena Afanasova wrote:
> > > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region.
> > > > Guest
> > > > read and write accesses are dispatched through the given
> > > > ioregionfd
> > > > instead of returning from ioctl(KVM_RUN). Regions can be
> > > > deleted by
> > > > setting fds to -1.
> > > > 
> > > > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > > > ---
> > > > Changes in v2:
> > > >    - changes after code review
> > > > 
> > > >   arch/x86/kvm/Kconfig     |   1 +
> > > >   arch/x86/kvm/Makefile    |   1 +
> > > >   arch/x86/kvm/x86.c       |   1 +
> > > >   include/linux/kvm_host.h |  17 +++
> > > >   include/uapi/linux/kvm.h |  23 ++++
> > > >   virt/kvm/Kconfig         |   3 +
> > > >   virt/kvm/eventfd.c       |  25 +++++
> > > >   virt/kvm/eventfd.h       |  14 +++
> > > >   virt/kvm/ioregion.c      | 232
> > > > +++++++++++++++++++++++++++++++++++++++
> > > >   virt/kvm/ioregion.h      |  15 +++
> > > >   virt/kvm/kvm_main.c      |  11 ++
> > > >   11 files changed, 343 insertions(+)
> > > >   create mode 100644 virt/kvm/eventfd.h
> > > >   create mode 100644 virt/kvm/ioregion.c
> > > >   create mode 100644 virt/kvm/ioregion.h
> > > > 
> > > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > > > index f92dfd8ef10d..b914ef375199 100644
> > > > --- a/arch/x86/kvm/Kconfig
> > > > +++ b/arch/x86/kvm/Kconfig
> > > > @@ -33,6 +33,7 @@ config KVM
> > > >   	select HAVE_KVM_IRQ_BYPASS
> > > >   	select HAVE_KVM_IRQ_ROUTING
> > > >   	select HAVE_KVM_EVENTFD
> > > > +	select KVM_IOREGION
> > > >   	select KVM_ASYNC_PF
> > > >   	select USER_RETURN_NOTIFIER
> > > >   	select KVM_MMIO
> > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > index b804444e16d4..b3b17dc9f7d4 100644
> > > > --- a/arch/x86/kvm/Makefile
> > > > +++ b/arch/x86/kvm/Makefile
> > > > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
> > > >   kvm-y			+= $(KVM)/kvm_main.o
> > > > $(KVM)/coalesced_mmio.o \
> > > >   				$(KVM)/eventfd.o
> > > > $(KVM)/irqchip.o
> > > > $(KVM)/vfio.o
> > > >   kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
> > > > +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
> > > >   
> > > >   kvm-y			+= x86.o emulate.o i8259.o irq.o
> > > > lapic.o \
> > > >   			   i8254.o ioapic.o irq_comm.o cpuid.o
> > > > pmu.o
> > > > mtrr.o \
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index e545a8a613b1..ddb28f5ca252 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct
> > > > kvm
> > > > *kvm, long ext)
> > > >   	case KVM_CAP_X86_USER_SPACE_MSR:
> > > >   	case KVM_CAP_X86_MSR_FILTER:
> > > >   	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > > > +	case KVM_CAP_IOREGIONFD:
> > > >   		r = 1;
> > > >   		break;
> > > >   	case KVM_CAP_SYNC_REGS:
> > > > diff --git a/include/linux/kvm_host.h
> > > > b/include/linux/kvm_host.h
> > > > index 7f2e2a09ebbd..7cd667dddba9 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -470,6 +470,10 @@ struct kvm {
> > > >   		struct mutex      resampler_lock;
> > > >   	} irqfds;
> > > >   	struct list_head ioeventfds;
> > > > +#endif
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +	struct list_head ioregions_mmio;
> > > > +	struct list_head ioregions_pio;
> > > >   #endif
> > > >   	struct kvm_vm_stat stat;
> > > >   	struct kvm_arch arch;
> > > > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct
> > > > kvm
> > > > *kvm, struct kvm_ioeventfd *args)
> > > >   
> > > >   #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > > >   
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +void kvm_ioregionfd_init(struct kvm *kvm);
> > > > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion
> > > > *args);
> > > > +
> > > > +#else
> > > > +
> > > > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
> > > > +static inline int kvm_ioregionfd(struct kvm *kvm, struct
> > > > kvm_ioregion *args)
> > > > +{
> > > > +	return -ENOSYS;
> > > > +}
> > > > +#endif
> > > > +
> > > >   void kvm_arch_irq_routing_update(struct kvm *kvm);
> > > >   
> > > >   static inline void kvm_make_request(int req, struct kvm_vcpu
> > > > *vcpu)
> > > > diff --git a/include/uapi/linux/kvm.h
> > > > b/include/uapi/linux/kvm.h
> > > > index ca41220b40b8..81e775778c66 100644
> > > > --- a/include/uapi/linux/kvm.h
> > > > +++ b/include/uapi/linux/kvm.h
> > > > @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
> > > >   	__u8  pad[36];
> > > >   };
> > > >   
> > > > +enum {
> > > > +	kvm_ioregion_flag_nr_pio,
> > > > +	kvm_ioregion_flag_nr_posted_writes,
> > > > +	kvm_ioregion_flag_nr_max,
> > > > +};
> > > > +
> > > > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
> > > > +#define KVM_IOREGION_POSTED_WRITES (1 <<
> > > > kvm_ioregion_flag_nr_posted_writes)
> > > > +
> > > > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
> > > > kvm_ioregion_flag_nr_max) - 1)
> > > > +
> > > > +struct kvm_ioregion {
> > > > +	__u64 guest_paddr; /* guest physical address */
> > > > +	__u64 memory_size; /* bytes */
> > > 
> > > Do we really need __u64 here?
> > > 
> > > 
> > > > +	__u64 user_data;
> > > > +	__s32 rfd;
> > > > +	__s32 wfd;
> > > > +	__u32 flags;
> > > > +	__u8  pad[28];
> > > > +};
> > > > +
> > > >   #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
> > > >   #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
> > > >   #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
> > > > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
> > > >   #define KVM_CAP_X86_USER_SPACE_MSR 188
> > > >   #define KVM_CAP_X86_MSR_FILTER 189
> > > >   #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> > > > +#define KVM_CAP_IOREGIONFD 191
> > > >   
> > > >   #ifdef KVM_CAP_IRQ_ROUTING
> > > >   
> > > > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
> > > >   					struct
> > > > kvm_userspace_memory_region)
> > > >   #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
> > > >   #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
> > > > +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
> > > > kvm_ioregion)
> > > >   
> > > >   /* enable ucontrol for s390 */
> > > >   struct kvm_s390_ucas_mapping {
> > > > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > > > index 1c37ccd5d402..5e6620bbf000 100644
> > > > --- a/virt/kvm/Kconfig
> > > > +++ b/virt/kvm/Kconfig
> > > > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
> > > >          bool
> > > >          select EVENTFD
> > > >   
> > > > +config KVM_IOREGION
> > > > +       bool
> > > > +
> > > >   config KVM_MMIO
> > > >          bool
> > > >   
> > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > index c2323c27a28b..aadb73903f8b 100644
> > > > --- a/virt/kvm/eventfd.c
> > > > +++ b/virt/kvm/eventfd.c
> > > > @@ -27,6 +27,7 @@
> > > >   #include <trace/events/kvm.h>
> > > >   
> > > >   #include <kvm/iodev.h>
> > > > +#include "ioregion.h"
> > > >   
> > > >   #ifdef CONFIG_HAVE_KVM_IRQFD
> > > >   
> > > > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
> > > > ioeventfd_ops = {
> > > >   	.destructor = ioeventfd_destructor,
> > > >   };
> > > >   
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +/* assumes kvm->slots_lock held */
> > > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
> > > > +			  u64 start, u64 size)
> > > > +{
> > > > +	struct _ioeventfd *_p;
> > > > +
> > > > +	list_for_each_entry(_p, &kvm->ioeventfds, list)
> > > > +		if (_p->bus_idx == bus_idx &&
> > > > +		    overlap(start, size, _p->addr,
> > > > +			    !_p->length ? 8 : _p->length))
> > > > +			return true;
> > > > +
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +
> > > >   /* assumes kvm->slots_lock held */
> > > >   static bool
> > > >   ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd
> > > > *p)
> > > > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
> > > > struct _ioeventfd *p)
> > > >   		       _p->datamatch == p->datamatch))))
> > > >   			return true;
> > > >   
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx ==
> > > > KVM_PIO_BUS)
> > > > +		if (kvm_ioregion_collides(kvm, p->bus_idx, p-
> > > > >addr,
> > > > +					  !p->length ? 8 : p-
> > > > >length))
> > > > +			return true;
> > > > +#endif
> > > > +
> > > >   	return false;
> > > >   }
> > > >   
> > > > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
> > > > new file mode 100644
> > > > index 000000000000..73a621eebae3
> > > > --- /dev/null
> > > > +++ b/virt/kvm/eventfd.h
> > > > @@ -0,0 +1,14 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > > +#ifndef __KVM_EVENTFD_H__
> > > > +#define __KVM_EVENTFD_H__
> > > > +
> > > > +#ifdef CONFIG_KVM_IOREGION
> > > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64
> > > > start,
> > > > u64 size);
> > > > +#else
> > > > +static inline bool
> > > > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
> > > > u64
> > > > size)
> > > > +{
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +#endif
> > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > > > new file mode 100644
> > > > index 000000000000..48ff92bca966
> > > > --- /dev/null
> > > > +++ b/virt/kvm/ioregion.c
> > > > @@ -0,0 +1,232 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +#include <linux/kvm_host.h>
> > > > +#include <linux/fs.h>
> > > > +#include <kvm/iodev.h>
> > > > +#include "eventfd.h"
> > > > +
> > > > +void
> > > > +kvm_ioregionfd_init(struct kvm *kvm)
> > > > +{
> > > > +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
> > > > +	INIT_LIST_HEAD(&kvm->ioregions_pio);
> > > > +}
> > > > +
> > > > +struct ioregion {
> > > > +	struct list_head     list;
> > > > +	u64                  paddr;  /* guest physical address
> > > > */
> > > > +	u64                  size;   /* size in bytes */
> > > > +	struct file         *rf;
> > > > +	struct file         *wf;
> > > > +	u64                  user_data; /* opaque token used by
> > > > userspace */
> > > > +	struct kvm_io_device dev;
> > > > +	bool                 posted_writes;
> > > > +};
> > > > +
> > > > +static inline struct ioregion *
> > > > +to_ioregion(struct kvm_io_device *dev)
> > > > +{
> > > > +	return container_of(dev, struct ioregion, dev);
> > > > +}
> > > > +
> > > > +/* assumes kvm->slots_lock held */
> > > > +static void
> > > > +ioregion_release(struct ioregion *p)
> > > > +{
> > > > +	fput(p->rf);
> > > > +	fput(p->wf);
> > > > +	list_del(&p->list);
> > > > +	kfree(p);
> > > > +}
> > > > +
> > > > +static int
> > > > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device
> > > > *this,
> > > > gpa_t addr,
> > > > +	      int len, void *val)
> > > > +{
> > > > +	return -EOPNOTSUPP;
> > > > +}
> > > > +
> > > > +static int
> > > > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device
> > > > *this,
> > > > gpa_t addr,
> > > > +		int len, const void *val)
> > > > +{
> > > > +	return -EOPNOTSUPP;
> > > > +}
> > > > +
> > > > +/*
> > > > + * This function is called as KVM is completely shutting
> > > > down.  We
> > > > do not
> > > > + * need to worry about locking just nuke anything we have as
> > > > quickly as possible
> > > > + */
> > > > +static void
> > > > +ioregion_destructor(struct kvm_io_device *this)
> > > > +{
> > > > +	struct ioregion *p = to_ioregion(this);
> > > > +
> > > > +	ioregion_release(p);
> > > > +}
> > > > +
> > > > +static const struct kvm_io_device_ops ioregion_ops = {
> > > > +	.read       = ioregion_read,
> > > > +	.write      = ioregion_write,
> > > > +	.destructor = ioregion_destructor,
> > > > +};
> > > > +
> > > > +static inline struct list_head *
> > > > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
> > > > +{
> > > > +	return (bus_idx == KVM_MMIO_BUS) ?
> > > > +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
> > > > +}
> > > > +
> > > > +/* check for not overlapping case and reverse */
> > > > +inline bool
> > > > +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
> > > > +{
> > > > +	u64 end1 = start1 + size1 - 1;
> > > > +	u64 end2 = start2 + size2 - 1;
> > > > +
> > > > +	return !(end1 < start2 || start1 >= end2);
> > > > +}
> > > > +
> > > > +/* assumes kvm->slots_lock held */
> > > > +bool
> > > > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
> > > > +		      u64 start, u64 size)
> > > > +{
> > > > +	struct ioregion *_p;
> > > > +	struct list_head *ioregions;
> > > > +
> > > > +	ioregions = get_ioregion_list(kvm, bus_idx);
> > > > +	list_for_each_entry(_p, ioregions, list)
> > > > +		if (overlap(start, size, _p->paddr, _p->size))
> > > > +			return true;
> > > > +
> > > > +	return false;
> > > > +}
> > > > +
> > > > +/* assumes kvm->slots_lock held */
> > > > +static bool
> > > > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
> > > > kvm_bus bus_idx)
> > > > +{
> > > > +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p-
> > > > >size) ||
> > > > +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p-
> > > > >size))
> > > > +		return true;
> > > > +
> > > > +	return false;
> > > > +}
> > > > +
> > > > +static enum kvm_bus
> > > > +get_bus_from_flags(__u32 flags)
> > > > +{
> > > > +	if (flags & KVM_IOREGION_PIO)
> > > > +		return KVM_PIO_BUS;
> > > > +	return KVM_MMIO_BUS;
> > > > +}
> > > > +
> > > > +int
> > > > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
> > > > +{
> > > > +	struct ioregion *p;
> > > > +	struct file *rfile, *wfile;
> > > > +	enum kvm_bus bus_idx;
> > > > +	int ret = 0;
> > > > +
> > > > +	if (!args->memory_size)
> > > > +		return -EINVAL;
> > > > +	if ((args->guest_paddr + args->memory_size - 1) < args-
> > > > > guest_paddr)
> > > > +		return -EINVAL;
> > > > +
> > > > +	rfile = fget(args->rfd);
> > > > +	if (!rfile)
> > > > +		return -EBADF;
> > > 
> > > So the question still, if we want to use ioregion fd for
> > > doorbell,
> > > we 
> > > don't need rfd in this case?
> > > 
> > Using ioregionfd for doorbell seems to be an open question.
> > Probably it
> > could just focus on the non-doorbell cases.
> 
> Below you replied FAST_MMIO will be in v3. That is the doorbell case,
> so
> maybe it is in scope for this patch series?
> 
Ok, will fix

> I think continuing to use ioeventfd for most doorbell registers makes
> sense.
> 
> However, there are two cases where ioregionfd doorbell support is
> interesting:
> 
> 1. The (non-FAST_MMIO) case where the application needs to know the
>    value written to the doorbell. ioeventfd cannot do this (datamatch
>    can handle a subset of cases but not all) so we need ioregionfd
> for
>    this.
> 
> 2. The FAST_MMIO case just for convenience if applications prefer to
> use
>    a single API (ioregionfd) instead of implementing both ioregionfd
> and
>    ioeventfd.
> 
> ioeventfd will still have its benefits (and limitations) that make it
> different from ioregionfd. In particular, ioregionfd will not merge
> doorbell writes into a single message because doing so would
> basically
> involve reimplementing ioeventfd functionality as part of ioregionfd
> and
> isn't compatible with the current approach where userspace can
> provide
> any file descriptor for communication.
> 
> Elena and Jason: do you agree with this API design?

I’m still not sure about coalescing the writes support, but in general
it looks ok to me.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-09 14:59     ` Stefan Hajnoczi
@ 2021-02-18  6:17       ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-18  6:17 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Elena Afanasova, kvm, jag.raman, elena.ufimtseva


On 2021/2/9 下午10:59, Stefan Hajnoczi wrote:
> On Mon, Feb 08, 2021 at 02:21:35PM +0800, Jason Wang wrote:
>> On 2021/1/30 上午2:48, Elena Afanasova wrote:
>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>> index ca41220b40b8..81e775778c66 100644
>>> --- a/include/uapi/linux/kvm.h
>>> +++ b/include/uapi/linux/kvm.h
>>> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>>>    	__u8  pad[36];
>>>    };
>>> +enum {
>>> +	kvm_ioregion_flag_nr_pio,
>>> +	kvm_ioregion_flag_nr_posted_writes,
>>> +	kvm_ioregion_flag_nr_max,
>>> +};
>>> +
>>> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
>>> +#define KVM_IOREGION_POSTED_WRITES (1 << kvm_ioregion_flag_nr_posted_writes)
>>> +
>>> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << kvm_ioregion_flag_nr_max) - 1)
>>> +
>>> +struct kvm_ioregion {
>>> +	__u64 guest_paddr; /* guest physical address */
>>> +	__u64 memory_size; /* bytes */
>>
>> Do we really need __u64 here?
> I think 64-bit PCI BARs can be >4 GB. There is plenty of space in this
> struct to support a 64-bit field.
>
> That said, userspace could also add more ioregions if it needs to cover
> more than 4 GB. That would slow down ioregion lookups though since the
> in-kernel data structure would become larger.
>
> Making it 64-bit seems more future-proof and cleaner than having to work
> around the limitation using multiple ioregions. Did you have a
> particular reason in mind why this field should not be 64 bits?


Nope. Just wonder what's the use case for that.

Thanks



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-10 19:31     ` Elena Afanasova
  2021-02-11 14:59       ` Stefan Hajnoczi
@ 2021-02-18  6:20       ` Jason Wang
  1 sibling, 0 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-18  6:20 UTC (permalink / raw)
  To: Elena Afanasova, kvm; +Cc: stefanha, jag.raman, elena.ufimtseva


On 2021/2/11 上午3:31, Elena Afanasova wrote:
>>> +	}
>> I wonder how much value if we stick a check like this here (if our
>> code
>> can gracefully deal with blocking fd).
>>
> Do you think it would be better to remove this check and just mention
> that in a comment or documentation?
>

Yes.

Thanks


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION
  2021-02-11 14:59       ` Stefan Hajnoczi
  2021-02-17 23:05         ` Elena Afanasova
@ 2021-02-18  6:22         ` Jason Wang
  1 sibling, 0 replies; 28+ messages in thread
From: Jason Wang @ 2021-02-18  6:22 UTC (permalink / raw)
  To: Stefan Hajnoczi, Elena Afanasova; +Cc: kvm, jag.raman, elena.ufimtseva


On 2021/2/11 下午10:59, Stefan Hajnoczi wrote:
> On Wed, Feb 10, 2021 at 11:31:30AM -0800, Elena Afanasova wrote:
>> On Mon, 2021-02-08 at 14:21 +0800, Jason Wang wrote:
>>> On 2021/1/30 上午2:48, Elena Afanasova wrote:
>>>> This vm ioctl adds or removes an ioregionfd MMIO/PIO region. Guest
>>>> read and write accesses are dispatched through the given ioregionfd
>>>> instead of returning from ioctl(KVM_RUN). Regions can be deleted by
>>>> setting fds to -1.
>>>>
>>>> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
>>>> ---
>>>> Changes in v2:
>>>>     - changes after code review
>>>>
>>>>    arch/x86/kvm/Kconfig     |   1 +
>>>>    arch/x86/kvm/Makefile    |   1 +
>>>>    arch/x86/kvm/x86.c       |   1 +
>>>>    include/linux/kvm_host.h |  17 +++
>>>>    include/uapi/linux/kvm.h |  23 ++++
>>>>    virt/kvm/Kconfig         |   3 +
>>>>    virt/kvm/eventfd.c       |  25 +++++
>>>>    virt/kvm/eventfd.h       |  14 +++
>>>>    virt/kvm/ioregion.c      | 232
>>>> +++++++++++++++++++++++++++++++++++++++
>>>>    virt/kvm/ioregion.h      |  15 +++
>>>>    virt/kvm/kvm_main.c      |  11 ++
>>>>    11 files changed, 343 insertions(+)
>>>>    create mode 100644 virt/kvm/eventfd.h
>>>>    create mode 100644 virt/kvm/ioregion.c
>>>>    create mode 100644 virt/kvm/ioregion.h
>>>>
>>>> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
>>>> index f92dfd8ef10d..b914ef375199 100644
>>>> --- a/arch/x86/kvm/Kconfig
>>>> +++ b/arch/x86/kvm/Kconfig
>>>> @@ -33,6 +33,7 @@ config KVM
>>>>    	select HAVE_KVM_IRQ_BYPASS
>>>>    	select HAVE_KVM_IRQ_ROUTING
>>>>    	select HAVE_KVM_EVENTFD
>>>> +	select KVM_IOREGION
>>>>    	select KVM_ASYNC_PF
>>>>    	select USER_RETURN_NOTIFIER
>>>>    	select KVM_MMIO
>>>> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
>>>> index b804444e16d4..b3b17dc9f7d4 100644
>>>> --- a/arch/x86/kvm/Makefile
>>>> +++ b/arch/x86/kvm/Makefile
>>>> @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm
>>>>    kvm-y			+= $(KVM)/kvm_main.o
>>>> $(KVM)/coalesced_mmio.o \
>>>>    				$(KVM)/eventfd.o $(KVM)/irqchip.o
>>>> $(KVM)/vfio.o
>>>>    kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>>>> +kvm-$(CONFIG_KVM_IOREGION)	+= $(KVM)/ioregion.o
>>>>    
>>>>    kvm-y			+= x86.o emulate.o i8259.o irq.o
>>>> lapic.o \
>>>>    			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
>>>> mtrr.o \
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index e545a8a613b1..ddb28f5ca252 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
>>>> *kvm, long ext)
>>>>    	case KVM_CAP_X86_USER_SPACE_MSR:
>>>>    	case KVM_CAP_X86_MSR_FILTER:
>>>>    	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
>>>> +	case KVM_CAP_IOREGIONFD:
>>>>    		r = 1;
>>>>    		break;
>>>>    	case KVM_CAP_SYNC_REGS:
>>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>>> index 7f2e2a09ebbd..7cd667dddba9 100644
>>>> --- a/include/linux/kvm_host.h
>>>> +++ b/include/linux/kvm_host.h
>>>> @@ -470,6 +470,10 @@ struct kvm {
>>>>    		struct mutex      resampler_lock;
>>>>    	} irqfds;
>>>>    	struct list_head ioeventfds;
>>>> +#endif
>>>> +#ifdef CONFIG_KVM_IOREGION
>>>> +	struct list_head ioregions_mmio;
>>>> +	struct list_head ioregions_pio;
>>>>    #endif
>>>>    	struct kvm_vm_stat stat;
>>>>    	struct kvm_arch arch;
>>>> @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct kvm
>>>> *kvm, struct kvm_ioeventfd *args)
>>>>    
>>>>    #endif /* CONFIG_HAVE_KVM_EVENTFD */
>>>>    
>>>> +#ifdef CONFIG_KVM_IOREGION
>>>> +void kvm_ioregionfd_init(struct kvm *kvm);
>>>> +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion *args);
>>>> +
>>>> +#else
>>>> +
>>>> +static inline void kvm_ioregionfd_init(struct kvm *kvm) {}
>>>> +static inline int kvm_ioregionfd(struct kvm *kvm, struct
>>>> kvm_ioregion *args)
>>>> +{
>>>> +	return -ENOSYS;
>>>> +}
>>>> +#endif
>>>> +
>>>>    void kvm_arch_irq_routing_update(struct kvm *kvm);
>>>>    
>>>>    static inline void kvm_make_request(int req, struct kvm_vcpu
>>>> *vcpu)
>>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>>> index ca41220b40b8..81e775778c66 100644
>>>> --- a/include/uapi/linux/kvm.h
>>>> +++ b/include/uapi/linux/kvm.h
>>>> @@ -732,6 +732,27 @@ struct kvm_ioeventfd {
>>>>    	__u8  pad[36];
>>>>    };
>>>>    
>>>> +enum {
>>>> +	kvm_ioregion_flag_nr_pio,
>>>> +	kvm_ioregion_flag_nr_posted_writes,
>>>> +	kvm_ioregion_flag_nr_max,
>>>> +};
>>>> +
>>>> +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio)
>>>> +#define KVM_IOREGION_POSTED_WRITES (1 <<
>>>> kvm_ioregion_flag_nr_posted_writes)
>>>> +
>>>> +#define KVM_IOREGION_VALID_FLAG_MASK ((1 <<
>>>> kvm_ioregion_flag_nr_max) - 1)
>>>> +
>>>> +struct kvm_ioregion {
>>>> +	__u64 guest_paddr; /* guest physical address */
>>>> +	__u64 memory_size; /* bytes */
>>> Do we really need __u64 here?
>>>
>>>
>>>> +	__u64 user_data;
>>>> +	__s32 rfd;
>>>> +	__s32 wfd;
>>>> +	__u32 flags;
>>>> +	__u8  pad[28];
>>>> +};
>>>> +
>>>>    #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
>>>>    #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
>>>>    #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
>>>> @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt {
>>>>    #define KVM_CAP_X86_USER_SPACE_MSR 188
>>>>    #define KVM_CAP_X86_MSR_FILTER 189
>>>>    #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
>>>> +#define KVM_CAP_IOREGIONFD 191
>>>>    
>>>>    #ifdef KVM_CAP_IRQ_ROUTING
>>>>    
>>>> @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce {
>>>>    					struct
>>>> kvm_userspace_memory_region)
>>>>    #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
>>>>    #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
>>>> +#define KVM_SET_IOREGION          _IOW(KVMIO,  0x49, struct
>>>> kvm_ioregion)
>>>>    
>>>>    /* enable ucontrol for s390 */
>>>>    struct kvm_s390_ucas_mapping {
>>>> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
>>>> index 1c37ccd5d402..5e6620bbf000 100644
>>>> --- a/virt/kvm/Kconfig
>>>> +++ b/virt/kvm/Kconfig
>>>> @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD
>>>>           bool
>>>>           select EVENTFD
>>>>    
>>>> +config KVM_IOREGION
>>>> +       bool
>>>> +
>>>>    config KVM_MMIO
>>>>           bool
>>>>    
>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>> index c2323c27a28b..aadb73903f8b 100644
>>>> --- a/virt/kvm/eventfd.c
>>>> +++ b/virt/kvm/eventfd.c
>>>> @@ -27,6 +27,7 @@
>>>>    #include <trace/events/kvm.h>
>>>>    
>>>>    #include <kvm/iodev.h>
>>>> +#include "ioregion.h"
>>>>    
>>>>    #ifdef CONFIG_HAVE_KVM_IRQFD
>>>>    
>>>> @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops
>>>> ioeventfd_ops = {
>>>>    	.destructor = ioeventfd_destructor,
>>>>    };
>>>>    
>>>> +#ifdef CONFIG_KVM_IOREGION
>>>> +/* assumes kvm->slots_lock held */
>>>> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx,
>>>> +			  u64 start, u64 size)
>>>> +{
>>>> +	struct _ioeventfd *_p;
>>>> +
>>>> +	list_for_each_entry(_p, &kvm->ioeventfds, list)
>>>> +		if (_p->bus_idx == bus_idx &&
>>>> +		    overlap(start, size, _p->addr,
>>>> +			    !_p->length ? 8 : _p->length))
>>>> +			return true;
>>>> +
>>>> +	return false;
>>>> +}
>>>> +#endif
>>>> +
>>>>    /* assumes kvm->slots_lock held */
>>>>    static bool
>>>>    ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>>>> @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm,
>>>> struct _ioeventfd *p)
>>>>    		       _p->datamatch == p->datamatch))))
>>>>    			return true;
>>>>    
>>>> +#ifdef CONFIG_KVM_IOREGION
>>>> +	if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == KVM_PIO_BUS)
>>>> +		if (kvm_ioregion_collides(kvm, p->bus_idx, p->addr,
>>>> +					  !p->length ? 8 : p->length))
>>>> +			return true;
>>>> +#endif
>>>> +
>>>>    	return false;
>>>>    }
>>>>    
>>>> diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h
>>>> new file mode 100644
>>>> index 000000000000..73a621eebae3
>>>> --- /dev/null
>>>> +++ b/virt/kvm/eventfd.h
>>>> @@ -0,0 +1,14 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>> +#ifndef __KVM_EVENTFD_H__
>>>> +#define __KVM_EVENTFD_H__
>>>> +
>>>> +#ifdef CONFIG_KVM_IOREGION
>>>> +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start,
>>>> u64 size);
>>>> +#else
>>>> +static inline bool
>>>> +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, u64
>>>> size)
>>>> +{
>>>> +	return false;
>>>> +}
>>>> +#endif
>>>> +#endif
>>>> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
>>>> new file mode 100644
>>>> index 000000000000..48ff92bca966
>>>> --- /dev/null
>>>> +++ b/virt/kvm/ioregion.c
>>>> @@ -0,0 +1,232 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +#include <linux/kvm_host.h>
>>>> +#include <linux/fs.h>
>>>> +#include <kvm/iodev.h>
>>>> +#include "eventfd.h"
>>>> +
>>>> +void
>>>> +kvm_ioregionfd_init(struct kvm *kvm)
>>>> +{
>>>> +	INIT_LIST_HEAD(&kvm->ioregions_mmio);
>>>> +	INIT_LIST_HEAD(&kvm->ioregions_pio);
>>>> +}
>>>> +
>>>> +struct ioregion {
>>>> +	struct list_head     list;
>>>> +	u64                  paddr;  /* guest physical address */
>>>> +	u64                  size;   /* size in bytes */
>>>> +	struct file         *rf;
>>>> +	struct file         *wf;
>>>> +	u64                  user_data; /* opaque token used by
>>>> userspace */
>>>> +	struct kvm_io_device dev;
>>>> +	bool                 posted_writes;
>>>> +};
>>>> +
>>>> +static inline struct ioregion *
>>>> +to_ioregion(struct kvm_io_device *dev)
>>>> +{
>>>> +	return container_of(dev, struct ioregion, dev);
>>>> +}
>>>> +
>>>> +/* assumes kvm->slots_lock held */
>>>> +static void
>>>> +ioregion_release(struct ioregion *p)
>>>> +{
>>>> +	fput(p->rf);
>>>> +	fput(p->wf);
>>>> +	list_del(&p->list);
>>>> +	kfree(p);
>>>> +}
>>>> +
>>>> +static int
>>>> +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
>>>> gpa_t addr,
>>>> +	      int len, void *val)
>>>> +{
>>>> +	return -EOPNOTSUPP;
>>>> +}
>>>> +
>>>> +static int
>>>> +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
>>>> gpa_t addr,
>>>> +		int len, const void *val)
>>>> +{
>>>> +	return -EOPNOTSUPP;
>>>> +}
>>>> +
>>>> +/*
>>>> + * This function is called as KVM is completely shutting down.  We
>>>> do not
>>>> + * need to worry about locking just nuke anything we have as
>>>> quickly as possible
>>>> + */
>>>> +static void
>>>> +ioregion_destructor(struct kvm_io_device *this)
>>>> +{
>>>> +	struct ioregion *p = to_ioregion(this);
>>>> +
>>>> +	ioregion_release(p);
>>>> +}
>>>> +
>>>> +static const struct kvm_io_device_ops ioregion_ops = {
>>>> +	.read       = ioregion_read,
>>>> +	.write      = ioregion_write,
>>>> +	.destructor = ioregion_destructor,
>>>> +};
>>>> +
>>>> +static inline struct list_head *
>>>> +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx)
>>>> +{
>>>> +	return (bus_idx == KVM_MMIO_BUS) ?
>>>> +		&kvm->ioregions_mmio : &kvm->ioregions_pio;
>>>> +}
>>>> +
>>>> +/* check for not overlapping case and reverse */
>>>> +inline bool
>>>> +overlap(u64 start1, u64 size1, u64 start2, u64 size2)
>>>> +{
>>>> +	u64 end1 = start1 + size1 - 1;
>>>> +	u64 end2 = start2 + size2 - 1;
>>>> +
>>>> +	return !(end1 < start2 || start1 >= end2);
>>>> +}
>>>> +
>>>> +/* assumes kvm->slots_lock held */
>>>> +bool
>>>> +kvm_ioregion_collides(struct kvm *kvm, int bus_idx,
>>>> +		      u64 start, u64 size)
>>>> +{
>>>> +	struct ioregion *_p;
>>>> +	struct list_head *ioregions;
>>>> +
>>>> +	ioregions = get_ioregion_list(kvm, bus_idx);
>>>> +	list_for_each_entry(_p, ioregions, list)
>>>> +		if (overlap(start, size, _p->paddr, _p->size))
>>>> +			return true;
>>>> +
>>>> +	return false;
>>>> +}
>>>> +
>>>> +/* assumes kvm->slots_lock held */
>>>> +static bool
>>>> +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum
>>>> kvm_bus bus_idx)
>>>> +{
>>>> +	if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p->size) ||
>>>> +	    kvm_eventfd_collides(kvm, bus_idx, p->paddr, p->size))
>>>> +		return true;
>>>> +
>>>> +	return false;
>>>> +}
>>>> +
>>>> +static enum kvm_bus
>>>> +get_bus_from_flags(__u32 flags)
>>>> +{
>>>> +	if (flags & KVM_IOREGION_PIO)
>>>> +		return KVM_PIO_BUS;
>>>> +	return KVM_MMIO_BUS;
>>>> +}
>>>> +
>>>> +int
>>>> +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
>>>> +{
>>>> +	struct ioregion *p;
>>>> +	struct file *rfile, *wfile;
>>>> +	enum kvm_bus bus_idx;
>>>> +	int ret = 0;
>>>> +
>>>> +	if (!args->memory_size)
>>>> +		return -EINVAL;
>>>> +	if ((args->guest_paddr + args->memory_size - 1) < args-
>>>>> guest_paddr)
>>>> +		return -EINVAL;
>>>> +
>>>> +	rfile = fget(args->rfd);
>>>> +	if (!rfile)
>>>> +		return -EBADF;
>>> So the question still stands: if we want to use ioregionfd for a
>>> doorbell, we don't need rfd in this case?
>>>
>> Using ioregionfd for a doorbell seems to be an open question. Probably
>> this series could just focus on the non-doorbell cases.
> Below you replied FAST_MMIO will be in v3. That is the doorbell case, so
> maybe it is in scope for this patch series?
>
> I think continuing to use ioeventfd for most doorbell registers makes
> sense.
>
> However, there are two cases where ioregionfd doorbell support is
> interesting:
>
> 1. The (non-FAST_MMIO) case where the application needs to know the
>     value written to the doorbell. ioeventfd cannot do this (datamatch
>     can handle a subset of cases but not all) so we need ioregionfd for
>     this.
>
> 2. The FAST_MMIO case just for convenience if applications prefer to use
>     a single API (ioregionfd) instead of implementing both ioregionfd and
>     ioeventfd.


Yes.


>
> ioeventfd will still have its benefits (and limitations) that make it
> different from ioregionfd. In particular, ioregionfd will not merge
> doorbell writes into a single message because doing so would basically
> involve reimplementing ioeventfd functionality as part of ioregionfd and
> isn't compatible with the current approach where userspace can provide
> any file descriptor for communication.
>
> Elena and Jason: do you agree with this API design?


I agree.

Thanks



^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-02-18  6:27 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-28 18:32 [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Elena Afanasova
2021-01-28 18:32 ` [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling Elena Afanasova
2021-01-30 16:58   ` Stefan Hajnoczi
2021-02-03 14:00     ` Elena Afanasova
2021-02-09  6:21   ` Jason Wang
2021-02-09 14:49     ` Stefan Hajnoczi
2021-02-10 19:06     ` Elena Afanasova
2021-02-09  6:26   ` Jason Wang
2021-01-28 18:32 ` [RFC v2 3/4] KVM: add support for ioregionfd cmds/replies serialization Elena Afanasova
2021-01-30 18:54   ` Stefan Hajnoczi
2021-02-03 14:10     ` Elena Afanasova
2021-01-28 18:32 ` [RFC v2 4/4] KVM: enforce NR_IOBUS_DEVS limit if kmemcg is disabled Elena Afanasova
2021-01-29 18:48 ` [RESEND RFC v2 1/4] KVM: add initial support for KVM_SET_IOREGION Elena Afanasova
2021-01-30 15:04   ` Stefan Hajnoczi
2021-02-04 13:03   ` Cornelia Huck
2021-02-05 18:39     ` Elena Afanasova
2021-02-08 11:49       ` Cornelia Huck
2021-02-08  6:21   ` Jason Wang
2021-02-09 14:59     ` Stefan Hajnoczi
2021-02-18  6:17       ` Jason Wang
2021-02-10 19:31     ` Elena Afanasova
2021-02-11 14:59       ` Stefan Hajnoczi
2021-02-17 23:05         ` Elena Afanasova
2021-02-18  6:22         ` Jason Wang
2021-02-18  6:20       ` Jason Wang
2021-01-30 14:56 ` [RFC v2 0/4] Introduce MMIO/PIO dispatch file descriptors (ioregionfd) Stefan Hajnoczi
2021-02-02 14:59 ` Stefan Hajnoczi
2021-02-08  6:02 ` Jason Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.