Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling

From: Jason Wang <jasowang@redhat.com>
To: Elena Afanasova <eafanasova@gmail.com>, kvm@vger.kernel.org
Cc: stefanha@redhat.com, jag.raman@oracle.com, elena.ufimtseva@oracle.com
Subject: Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling
Date: Tue, 9 Feb 2021 14:21:22 +0800	[thread overview]
Message-ID: <909c71c9-e83e-e6d8-0a33-92dac5b0b5c6@redhat.com> (raw)
In-Reply-To: <aa049c6e5bade3565c5ffa820bbbb67bd5d1bf4b.1611850291.git.eafanasova@gmail.com>

On 2021/1/29 上午2:32, Elena Afanasova wrote:
> The vCPU thread may receive a signal during ioregionfd communication,
> ioctl(KVM_RUN) needs to return to userspace and then ioctl(KVM_RUN)
> must resume ioregionfd.

It looks to me the patch contains much more than just signal handling 
(e.g the protocol). Please split.

>
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
> Changes in v2:
>    - add support for x86 signal handling
>    - changes after code review
>
>   arch/x86/kvm/x86.c            | 196 +++++++++++++++++++++++++++++++---
>   include/linux/kvm_host.h      |  13 +++
>   include/uapi/linux/ioregion.h |  32 ++++++
>   virt/kvm/ioregion.c           | 177 +++++++++++++++++++++++++++++-
>   virt/kvm/kvm_main.c           |  16 ++-
>   5 files changed, 415 insertions(+), 19 deletions(-)
>   create mode 100644 include/uapi/linux/ioregion.h

I wonder whether it's better to split into two patches:

1) general signal support for KVM I/O device
2) the ioregionfd part

>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ddb28f5ca252..a04516b531da 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>   {
>   	int handled = 0;
>   	int n;
> +	int ret = 0;
> +	bool is_apic;
>   
>   	do {
>   		n = min(len, 8);
> -		if (!(lapic_in_kernel(vcpu) &&
> -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
> -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
> -			break;
> +		is_apic = lapic_in_kernel(vcpu) &&
> +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> +					      addr, n, v);

A better name is needed since "is_apic" only covers the first condition.

> +		if (!is_apic) {
> +			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
> +					       addr, n, v);
> +			if (ret)
> +				break;
> +		}
>   		handled += n;
>   		addr += n;
>   		len -= n;
>   		v += n;
>   	} while (len);
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;

My understanding is that, we should check ERESTARTSYS instead of EINTR. 
EINTR means the syscall can't be restarted.

E.g we had the following errno for sockets:

/* Alas, with timeout socket operations are not restartable.
  * Compare this to poll().
  */
static inline int sock_intr_errno(long timeo)
{
     return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
}

For the case of EINTR, do we need a fallback to vcpu userspace process 
(Qemu)?

And we probably need a trace point here.

> +	}
> +#endif
> +
>   	return handled;
>   }
>   
> @@ -5819,14 +5833,20 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
>   {
>   	int handled = 0;
>   	int n;
> +	int ret = 0;
> +	bool is_apic;
>   
>   	do {
>   		n = min(len, 8);
> -		if (!(lapic_in_kernel(vcpu) &&
> -		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> -					 addr, n, v))
> -		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
> -			break;
> +		is_apic = lapic_in_kernel(vcpu) &&
> +			  !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> +					     addr, n, v);
> +		if (!is_apic) {
> +			ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS,
> +					      addr, n, v);
> +			if (ret)
> +				break;
> +		}
>   		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
>   		handled += n;
>   		addr += n;
> @@ -5834,6 +5854,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
>   		v += n;
>   	} while (len);
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;
> +	}
> +#endif
> +
>   	return handled;
>   }
>   
> @@ -6294,6 +6321,12 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
>   	vcpu->mmio_needed = 1;
>   	vcpu->mmio_cur_fragment = 0;
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (vcpu->ioregion_interrupted &&
> +	    vcpu->run->exit_reason == KVM_EXIT_INTR)
> +		return (vcpu->ioregion_ctx.in) ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
> +#endif
> +
>   	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
>   	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
>   	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> @@ -6411,16 +6444,23 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
>   
>   	for (i = 0; i < vcpu->arch.pio.count; i++) {
>   		if (vcpu->arch.pio.in)
> -			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
> +			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
> +					    vcpu->arch.pio.port,
>   					    vcpu->arch.pio.size, pd);
>   		else
>   			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
> -					     vcpu->arch.pio.port, vcpu->arch.pio.size,
> -					     pd);
> +					     vcpu->arch.pio.port,
> +					     vcpu->arch.pio.size, pd);
>   		if (r)
>   			break;
>   		pd += vcpu->arch.pio.size;
>   	}
> +#ifdef CONFIG_KVM_IOREGION
> +	if (vcpu->ioregion_interrupted && r == -EINTR) {
> +		vcpu->ioregion_ctx.pio = i;
> +	}
> +#endif
> +
>   	return r;
>   }
>   
> @@ -6428,16 +6468,27 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
>   			       unsigned short port, void *val,
>   			       unsigned int count, bool in)
>   {
> +	int ret = 0;
> +
>   	vcpu->arch.pio.port = port;
>   	vcpu->arch.pio.in = in;
>   	vcpu->arch.pio.count  = count;
>   	vcpu->arch.pio.size = size;
>   
> -	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
> +	ret = kernel_pio(vcpu, vcpu->arch.pio_data);
> +	if (!ret) {

Unnecessary changes.

>   		vcpu->arch.pio.count = 0;
>   		return 1;
>   	}
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (ret == -EINTR) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;
> +		return 0;
> +	}
> +#endif
> +
>   	vcpu->run->exit_reason = KVM_EXIT_IO;
>   	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
>   	vcpu->run->io.size = size;
> @@ -7141,6 +7192,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
>   
>   static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
>   static int complete_emulated_pio(struct kvm_vcpu *vcpu);
> +#ifdef CONFIG_KVM_IOREGION
> +static int complete_ioregion_io(struct kvm_vcpu *vcpu);
> +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu);
> +#endif
>   
>   static void kvm_smm_changed(struct kvm_vcpu *vcpu)
>   {
> @@ -7405,6 +7460,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>   		r = 1;
>   		if (inject_emulated_exception(vcpu))
>   			return r;
> +#ifdef CONFIG_KVM_IOREGION
> +	} else if (vcpu->ioregion_interrupted &&
> +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> +		if (vcpu->ioregion_ctx.in)
> +			writeback = false;
> +		vcpu->arch.complete_userspace_io = complete_ioregion_io;
> +		r = 0;
> +#endif
>   	} else if (vcpu->arch.pio.count) {
>   		if (!vcpu->arch.pio.in) {
>   			/* FIXME: return into emulator if single-stepping.  */
> @@ -7501,6 +7564,11 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
>   		vcpu->arch.complete_userspace_io =
>   			complete_fast_pio_out_port_0x7e;
>   		kvm_skip_emulated_instruction(vcpu);
> +#ifdef CONFIG_KVM_IOREGION
> +	} else if (vcpu->ioregion_interrupted &&
> +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> +		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
> +#endif
>   	} else {
>   		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
>   		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
> @@ -7548,6 +7616,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
>   		return ret;
>   	}
>   
> +#ifdef CONFIG_KVM_IOREGION
> +	if (vcpu->ioregion_interrupted &&
> +	    vcpu->run->exit_reason == KVM_EXIT_INTR) {
> +		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
> +		return 0;
> +	}
> +#endif
>   	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
>   	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
>   
> @@ -9204,6 +9279,101 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>   
> +#ifdef CONFIG_KVM_IOREGION
> +static void complete_ioregion_access(struct kvm_vcpu *vcpu, gpa_t addr,
> +				     int len, void *val)
> +{
> +	if (vcpu->ioregion_ctx.in)
> +		vcpu->ioregion_ctx.dev->ops->read(vcpu, vcpu->ioregion_ctx.dev,
> +						  addr, len, val);
> +	else
> +		vcpu->ioregion_ctx.dev->ops->write(vcpu, vcpu->ioregion_ctx.dev,
> +						   addr, len, val);

Two dumb questions:

1) So if the write is interrupted by the signal, we may do twice or more 
write. Can this satisfies the semantics of all type of registers? E.g 
for the hardware that counts the time of write to a specific register etc.
2) If the answer is yes, can we simply rewind RIP to re-emulate the 
instruction?

> +}
> +
> +static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_mmio_fragment *frag;
> +	int idx, ret, i, n;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +	for (i = vcpu->mmio_cur_fragment; i < vcpu->mmio_nr_fragments; i++) {
> +		frag = &vcpu->mmio_fragments[i];
> +		do {
> +			n = min(8u, frag->len);
> +			complete_ioregion_access(vcpu, frag->gpa, n, frag->data);
> +			frag->len -= n;
> +			frag->data += n;
> +			frag->gpa += n;
> +		} while (frag->len);
> +		vcpu->mmio_cur_fragment++;
> +	}
> +
> +	vcpu->mmio_needed = 0;
> +	if (!vcpu->ioregion_ctx.in) {
> +		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +		return 1;
> +	}
> +
> +	vcpu->mmio_read_completed = 1;
> +	ret = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +	return ret;
> +}
> +
> +static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
> +{
> +	int i, idx, r = 1;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +	for (i = vcpu->ioregion_ctx.pio; i < vcpu->arch.pio.count; i++) {
> +		complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> +					 vcpu->ioregion_ctx.len,
> +					 vcpu->ioregion_ctx.val);
> +		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
> +	}
> +
> +	if (vcpu->ioregion_ctx.in)
> +		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +	vcpu->arch.pio.count = 0;
> +
> +	return r;
> +}
> +
> +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
> +{
> +	int idx;
> +	u64 val;
> +
> +	BUG_ON(!vcpu->ioregion_interrupted);
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> +				 vcpu->ioregion_ctx.len,
> +				 vcpu->ioregion_ctx.val);
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (vcpu->ioregion_ctx.in) {
> +		memcpy(&val, vcpu->ioregion_ctx.val, vcpu->ioregion_ctx.len);
> +		kvm_rax_write(vcpu, val);
> +	}
> +	vcpu->arch.pio.count = 0;
> +
> +	return kvm_skip_emulated_instruction(vcpu);
> +}
> +
> +static int complete_ioregion_io(struct kvm_vcpu *vcpu)
> +{
> +	BUG_ON(!vcpu->ioregion_interrupted);
> +
> +	if (vcpu->mmio_needed)
> +		return complete_ioregion_mmio(vcpu);
> +	if (vcpu->arch.pio.count)
> +		return complete_ioregion_pio(vcpu);
> +}
> +#endif
> +
>   static void kvm_save_current_fpu(struct fpu *fpu)
>   {
>   	/*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7cd667dddba9..5cfdecfca6db 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -318,6 +318,19 @@ struct kvm_vcpu {
>   #endif
>   	bool preempted;
>   	bool ready;
> +#ifdef CONFIG_KVM_IOREGION
> +	bool ioregion_interrupted;
> +	struct {
> +		struct kvm_io_device *dev;
> +		int pio;
> +		void *val;
> +		u8 state;

Let's document the state machine here.

> +		u64 addr;
> +		int len;
> +		u64 data;
> +		bool in;
> +	} ioregion_ctx;
> +#endif
>   	struct kvm_vcpu_arch arch;
>   };
>   
> diff --git a/include/uapi/linux/ioregion.h b/include/uapi/linux/ioregion.h
> new file mode 100644
> index 000000000000..7898c01f84a1
> --- /dev/null
> +++ b/include/uapi/linux/ioregion.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_IOREGION_H
> +#define _UAPI_LINUX_IOREGION_H
> +
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};
> +
> +struct ioregionfd_resp {
> +	__u64 data;
> +	__u8 pad[24];
> +};
> +
> +#define IOREGIONFD_CMD_READ    0
> +#define IOREGIONFD_CMD_WRITE   1
> +
> +#define IOREGIONFD_SIZE_8BIT   0
> +#define IOREGIONFD_SIZE_16BIT  1
> +#define IOREGIONFD_SIZE_32BIT  2
> +#define IOREGIONFD_SIZE_64BIT  3
> +
> +#define IOREGIONFD_SIZE_OFFSET 4
> +#define IOREGIONFD_RESP_OFFSET 6
> +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)

Instead of using macros, why not explicitly define them in struct 
ioregionfd_cmd instead of using info?

> +
> +#endif
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index 48ff92bca966..da38124e1418 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -3,6 +3,7 @@
>   #include <linux/fs.h>
>   #include <kvm/iodev.h>
>   #include "eventfd.h"
> +#include <uapi/linux/ioregion.h>
>   
>   void
>   kvm_ioregionfd_init(struct kvm *kvm)
> @@ -38,18 +39,190 @@ ioregion_release(struct ioregion *p)
>   	kfree(p);
>   }
>   
> +static bool
> +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, int resp,
> +	 u64 user_data, const void *val)
> +{
> +	u64 size = 0;
> +
> +	switch (len) {
> +	case 1:
> +		size = IOREGIONFD_SIZE_8BIT;
> +		break;
> +	case 2:
> +		size = IOREGIONFD_SIZE_16BIT;
> +		break;
> +	case 4:
> +		size = IOREGIONFD_SIZE_32BIT;
> +		break;
> +	case 8:
> +		size = IOREGIONFD_SIZE_64BIT;
> +		break;
> +	default:
> +		return false;
> +	}
> +
> +	if (val)
> +		memcpy(&cmd->data, val, len);
> +	cmd->user_data = user_data;
> +	cmd->offset = offset;
> +	cmd->info |= opt;
> +	cmd->info |= IOREGIONFD_SIZE(size);
> +	cmd->info |= IOREGIONFD_RESP(resp);
> +
> +	return true;
> +}
> +
> +enum {
> +	SEND_CMD,
> +	GET_REPLY,
> +	COMPLETE
> +};
> +
> +static void
> +ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> +		  bool in, gpa_t addr, int len, u64 data, u8 state, void *val)
> +{
> +	vcpu->ioregion_interrupted = true;
> +
> +	vcpu->ioregion_ctx.dev = this;
> +	vcpu->ioregion_ctx.val = val;
> +	vcpu->ioregion_ctx.state = state;
> +	vcpu->ioregion_ctx.addr = addr;
> +	vcpu->ioregion_ctx.len = len;
> +	vcpu->ioregion_ctx.data = data;
> +	vcpu->ioregion_ctx.in = in;
> +}
> +
>   static int
>   ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
>   	      int len, void *val)
>   {
> -	return -EOPNOTSUPP;
> +	struct ioregion *p = to_ioregion(this);
> +	union {
> +		struct ioregionfd_cmd cmd;
> +		struct ioregionfd_resp resp;
> +	} buf;
> +	int ret = 0;
> +	int state = 0;

Let's use SEND_CMD otherwise it would be hard for the reviewers...

> +
> +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> +		return -EINVAL;
> +
> +	if (unlikely(vcpu->ioregion_interrupted)) {
> +		vcpu->ioregion_interrupted = false;
> +
> +		switch (vcpu->ioregion_ctx.state) {
> +		case SEND_CMD:
> +			goto send_cmd;
> +		case GET_REPLY:
> +			goto get_repl;
> +		case COMPLETE:

I fail to understand under what condition can we reach here?

> +			memcpy(val, &vcpu->ioregion_ctx.data, len);
> +			return 0;
> +		}
> +	}
> +
> +send_cmd:
> +	memset(&buf, 0, sizeof(buf));
> +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
> +		      1, p->user_data, NULL))
> +		return -EOPNOTSUPP;
> +
> +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> +	state = (ret == sizeof(buf.cmd));
> +	if (signal_pending(current)) {
> +		ioregion_save_ctx(vcpu, this, 1, addr, len, 0, state, val);
> +		return -EINTR;
> +	}
> +	if (ret != sizeof(buf.cmd)) {
> +		ret = (ret < 0) ? ret : -EIO;
> +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +	}
> +
> +get_repl:
> +	memset(&buf, 0, sizeof(buf));
> +	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> +	state += (ret == sizeof(buf.resp));

Let's use enum instead of doing tricks like this.

Thanks

> +	if (signal_pending(current)) {
> +		ioregion_save_ctx(vcpu, this, 1, addr, len, buf.resp.data, state, val);
> +		return -EINTR;
> +	}
> +	if (ret != sizeof(buf.resp)) {
> +		ret = (ret < 0) ? ret : -EIO;
> +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +	}
> +
> +	memcpy(val, &buf.resp.data, len);
> +
> +	return 0;
>   }
>   
>   static int
>   ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
>   		int len, const void *val)
>   {
> -	return -EOPNOTSUPP;
> +	struct ioregion *p = to_ioregion(this);
> +	union {
> +		struct ioregionfd_cmd cmd;
> +		struct ioregionfd_resp resp;
> +	} buf;
> +	int ret = 0;
> +	int state = 0;
> +
> +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> +		return -EINVAL;
> +
> +	if (unlikely(vcpu->ioregion_interrupted)) {
> +		vcpu->ioregion_interrupted = false;
> +
> +		switch (vcpu->ioregion_ctx.state) {
> +		case SEND_CMD:
> +			goto send_cmd;
> +		case GET_REPLY:
> +			if (!p->posted_writes)
> +				goto get_repl;
> +			fallthrough;
> +		case COMPLETE:
> +			return 0;
> +		}
> +	}
> +
> +send_cmd:
> +	memset(&buf, 0, sizeof(buf));
> +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
> +		      p->posted_writes ? 0 : 1, p->user_data, val))
> +		return -EOPNOTSUPP;
> +
> +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> +	state = (ret == sizeof(buf.cmd));
> +	if (signal_pending(current)) {
> +		ioregion_save_ctx(vcpu, this, 0, addr, len,
> +				  0, state, (void *)val);
> +		return -EINTR;
> +	}
> +	if (ret != sizeof(buf.cmd)) {
> +		ret = (ret < 0) ? ret : -EIO;
> +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +	}
> +
> +get_repl:
> +	if (!p->posted_writes) {
> +		memset(&buf, 0, sizeof(buf));
> +		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> +		state += (ret == sizeof(buf.resp));
> +		if (signal_pending(current)) {
> +			ioregion_save_ctx(vcpu, this, 0, addr, len,
> +					  0, state, (void *)val);
> +			return -EINTR;
> +		}
> +		if (ret != sizeof(buf.resp)) {
> +			ret = (ret < 0) ? ret : -EIO;
> +			return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> +		}
> +	}
> +
> +	return 0;
>   }
>   
>   /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 88b92fc3da51..df387857f51f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   			      struct kvm_io_range *range, const void *val)
>   {
>   	int idx;
> +	int ret = 0;
>   
>   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
>   	if (idx < 0)
> @@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   
>   	while (idx < bus->dev_count &&
>   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> -		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> -					range->len, val))
> +		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> +					 range->len, val);
> +		if (!ret)
>   			return idx;
> +		if (ret < 0 && ret != -EOPNOTSUPP)
> +			return ret;
>   		idx++;
>   	}
>   
> @@ -4264,6 +4268,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   			     struct kvm_io_range *range, void *val)
>   {
>   	int idx;
> +	int ret = 0;
>   
>   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
>   	if (idx < 0)
> @@ -4271,9 +4276,12 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>   
>   	while (idx < bus->dev_count &&
>   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> -		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
> -				       range->len, val))
> +		ret = kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
> +					range->len, val);
> +		if (!ret)
>   			return idx;
> +		if (ret < 0 && ret != -EOPNOTSUPP)
> +			return ret;
>   		idx++;
>   	}
>