All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] kvm: Extend irqfd to support level interrupts
@ 2012-06-16 16:34 Alex Williamson
  2012-06-17 11:17 ` Jan Kiszka
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-16 16:34 UTC (permalink / raw)
  To: avi, mtosatti; +Cc: kvm, mst, jan.kiszka

I'm looking for opinions on this approach.  For vfio device assignment
we minimally need a way to get EOIs from the in-kernel irqchip out to
userspace.  Getting that out via an eventfd would allow us to bounce
all level interrupts out to userspace, where we would de-assert the
device interrupt in qemu and unmask the physical device.  Ideally we
could deassert the interrupt in KVM, which allows us to send the EOI
directly to vfio.  To do that, we need to use a new IRQ source ID so
the guest sees the logical OR of qemu requested state and external
device state.  This allows external devices to share interrupts with
emulated devices, just like KVM assignment.  That means the means we
also need to use a new source ID when injecting the interrupt via
irqfd.

Rather than creating a source ID allocation interface, extending irqfd
to support a user supplied source ID, and creating another new
interface to get the EOI out, I think it works out better to bundle
these all together as just a level irqfd interface.  This way we don't
allow users to create unbalanced states where a level interrupt is
asserted, but has no way of being deasserted.  I believe the below
does this, but needs testing and validation with an implementation in
qemu.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 arch/x86/kvm/x86.c       |    1 +
 include/linux/kvm.h      |    6 +++
 include/linux/kvm_host.h |    4 +-
 virt/kvm/eventfd.c       |   90 ++++++++++++++++++++++++++++++++++++++--------
 virt/kvm/kvm_main.c      |    2 +
 5 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a01a424..80bed07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
+	case KVM_CAP_IRQFD_LEVEL:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..cca49a1 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_IRQFD_LEVEL 81
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -683,12 +684,15 @@ struct kvm_xen_hvm_config {
 #endif
 
 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+/* Available with KVM_CAP_IRQFD_LEVEL */
+#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
 
 struct kvm_irqfd {
 	__u32 fd;
 	__u32 gsi;
 	__u32 flags;
-	__u8  pad[20];
+	__u32 eoi_fd;
+	__u8 pad[16];
 };
 
 struct kvm_clock_data {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 27ac8a4..ae3b426 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -824,7 +824,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 #ifdef CONFIG_HAVE_KVM_EVENTFD
 
 void kvm_eventfd_init(struct kvm *kvm);
-int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
+int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
 void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
@@ -833,7 +833,7 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
 static inline void kvm_eventfd_init(struct kvm *kvm) {}
 
-static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 {
 	return -EINVAL;
 }
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index f59c1e8..89a6fa9 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -49,9 +49,13 @@ struct _irqfd {
 	wait_queue_t wait;
 	/* Update side is protected by irqfds.lock */
 	struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
-	/* Used for level IRQ fast-path */
+	/* Used for IRQ fast-path */
 	int gsi;
 	struct work_struct inject;
+	/* Used for level EOI path */
+	int irq_source_id;
+	struct eventfd_ctx *eoi_eventfd;
+	struct kvm_irq_ack_notifier notifier;
 	/* Used for setup/shutdown */
 	struct eventfd_ctx *eventfd;
 	struct list_head list;
@@ -62,7 +66,7 @@ struct _irqfd {
 static struct workqueue_struct *irqfd_cleanup_wq;
 
 static void
-irqfd_inject(struct work_struct *work)
+irqfd_inject_edge(struct work_struct *work)
 {
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
 	struct kvm *kvm = irqfd->kvm;
@@ -71,6 +75,23 @@ irqfd_inject(struct work_struct *work)
 	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
 }
 
+static void
+irqfd_inject_level(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+
+	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1);
+}
+
+static void
+irqfd_ack_level(struct kvm_irq_ack_notifier *notifier)
+{
+	struct _irqfd *irqfd  = container_of(notifier, struct _irqfd, notifier);
+
+	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
+	eventfd_signal(irqfd->eoi_eventfd, 1);
+}
+
 /*
  * Race-free decouple logic (ordering is critical)
  */
@@ -96,6 +117,14 @@ irqfd_shutdown(struct work_struct *work)
 	 * It is now safe to release the object's resources
 	 */
 	eventfd_ctx_put(irqfd->eventfd);
+
+	if (irqfd->eoi_eventfd) {
+		kvm_unregister_irq_ack_notifier(irqfd->kvm, &irqfd->notifier);
+		eventfd_ctx_put(irqfd->eoi_eventfd);
+		kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
+		kvm_free_irq_source_id(irqfd->kvm, irqfd->irq_source_id);
+	}
+
 	kfree(irqfd);
 }
 
@@ -198,13 +227,13 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
 }
 
 static int
-kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
+kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 {
 	struct kvm_irq_routing_table *irq_rt;
 	struct _irqfd *irqfd, *tmp;
 	struct file *file = NULL;
-	struct eventfd_ctx *eventfd = NULL;
-	int ret;
+	struct eventfd_ctx *eventfd = NULL, *eoi_eventfd = NULL;
+	int ret, irq_source_id = -1;
 	unsigned int events;
 
 	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
@@ -212,12 +241,35 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 		return -ENOMEM;
 
 	irqfd->kvm = kvm;
-	irqfd->gsi = gsi;
+	irqfd->gsi = args->gsi;
 	INIT_LIST_HEAD(&irqfd->list);
-	INIT_WORK(&irqfd->inject, irqfd_inject);
+
+	if (args->flags & KVM_IRQFD_FLAG_LEVEL) {
+		irq_source_id = kvm_request_irq_source_id(kvm);
+		if (irq_source_id < 0) {
+			ret = irq_source_id;
+			goto fail;
+		}
+
+		eoi_eventfd = eventfd_ctx_fdget(args->eoi_fd);
+		if (IS_ERR(eoi_eventfd)) {
+			ret = PTR_ERR(eoi_eventfd);
+			goto fail;
+		}
+
+		irqfd->irq_source_id = irq_source_id;
+		irqfd->eoi_eventfd = eoi_eventfd;
+		irqfd->notifier.gsi = args->gsi;
+		irqfd->notifier.irq_acked = irqfd_ack_level;
+		kvm_register_irq_ack_notifier(kvm, &irqfd->notifier);
+
+		INIT_WORK(&irqfd->inject, irqfd_inject_level);
+	} else
+		INIT_WORK(&irqfd->inject, irqfd_inject_edge);
+
 	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
 
-	file = eventfd_fget(fd);
+	file = eventfd_fget(args->fd);
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
 		goto fail;
@@ -282,6 +334,14 @@ fail:
 	if (!IS_ERR(file))
 		fput(file);
 
+	if (eoi_eventfd && !IS_ERR(eoi_eventfd)) {
+		kvm_unregister_irq_ack_notifier(kvm, &irqfd->notifier);
+		eventfd_ctx_put(eoi_eventfd);
+	}
+
+	if (irq_source_id >= 0)
+		kvm_free_irq_source_id(kvm, irq_source_id);
+
 	kfree(irqfd);
 	return ret;
 }
@@ -298,19 +358,19 @@ kvm_eventfd_init(struct kvm *kvm)
  * shutdown any irqfd's that match fd+gsi
  */
 static int
-kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
+kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 {
 	struct _irqfd *irqfd, *tmp;
 	struct eventfd_ctx *eventfd;
 
-	eventfd = eventfd_ctx_fdget(fd);
+	eventfd = eventfd_ctx_fdget(args->fd);
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
 	spin_lock_irq(&kvm->irqfds.lock);
 
 	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
-		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
+		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
 			/*
 			 * This rcu_assign_pointer is needed for when
 			 * another thread calls kvm_irq_routing_update before
@@ -338,12 +398,12 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
 }
 
 int
-kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
-		return kvm_irqfd_deassign(kvm, fd, gsi);
+	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
+		return kvm_irqfd_deassign(kvm, args);
 
-	return kvm_irqfd_assign(kvm, fd, gsi);
+	return kvm_irqfd_assign(kvm, args);
 }
 
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 02cb440..b4ad14cc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2059,7 +2059,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&data, argp, sizeof data))
 			goto out;
-		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
+		r = kvm_irqfd(kvm, &data);
 		break;
 	}
 	case KVM_IOEVENTFD: {


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-16 16:34 [RFC PATCH] kvm: Extend irqfd to support level interrupts Alex Williamson
@ 2012-06-17 11:17 ` Jan Kiszka
  2012-06-17 18:44 ` Michael S. Tsirkin
  2012-06-18  8:02 ` Avi Kivity
  2 siblings, 0 replies; 24+ messages in thread
From: Jan Kiszka @ 2012-06-17 11:17 UTC (permalink / raw)
  To: Alex Williamson; +Cc: avi, mtosatti, kvm, mst

[-- Attachment #1: Type: text/plain, Size: 10002 bytes --]

On 2012-06-16 18:34, Alex Williamson wrote:
> I'm looking for opinions on this approach.  For vfio device assignment
> we minimally need a way to get EOIs from the in-kernel irqchip out to
> userspace.  Getting that out via an eventfd would allow us to bounce
> all level interrupts out to userspace, where we would de-assert the
> device interrupt in qemu and unmask the physical device.  Ideally we
> could deassert the interrupt in KVM, which allows us to send the EOI
> directly to vfio.  To do that, we need to use a new IRQ source ID so
> the guest sees the logical OR of qemu requested state and external
> device state.  This allows external devices to share interrupts with
> emulated devices, just like KVM assignment.  That means the means we
> also need to use a new source ID when injecting the interrupt via
> irqfd.
> 
> Rather than creating a source ID allocation interface, extending irqfd
> to support a user supplied source ID, and creating another new
> interface to get the EOI out, I think it works out better to bundle
> these all together as just a level irqfd interface.  This way we don't
> allow users to create unbalanced states where a level interrupt is
> asserted, but has no way of being deasserted.  I believe the below
> does this, but needs testing and validation with an implementation in
> qemu.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  arch/x86/kvm/x86.c       |    1 +
>  include/linux/kvm.h      |    6 +++
>  include/linux/kvm_host.h |    4 +-
>  virt/kvm/eventfd.c       |   90 ++++++++++++++++++++++++++++++++++++++--------
>  virt/kvm/kvm_main.c      |    2 +
>  5 files changed, 84 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a01a424..80bed07 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_GET_TSC_KHZ:
>  	case KVM_CAP_PCI_2_3:
>  	case KVM_CAP_KVMCLOCK_CTRL:
> +	case KVM_CAP_IRQFD_LEVEL:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 2ce09aa..cca49a1 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_PPC_GET_SMMU_INFO 78
>  #define KVM_CAP_S390_COW 79
>  #define KVM_CAP_PPC_ALLOC_HTAB 80
> +#define KVM_CAP_IRQFD_LEVEL 81
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -683,12 +684,15 @@ struct kvm_xen_hvm_config {
>  #endif
>  
>  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> +/* Available with KVM_CAP_IRQFD_LEVEL */
> +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
>  
>  struct kvm_irqfd {
>  	__u32 fd;
>  	__u32 gsi;
>  	__u32 flags;
> -	__u8  pad[20];
> +	__u32 eoi_fd;
> +	__u8 pad[16];
>  };
>  
>  struct kvm_clock_data {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 27ac8a4..ae3b426 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -824,7 +824,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
>  #ifdef CONFIG_HAVE_KVM_EVENTFD
>  
>  void kvm_eventfd_init(struct kvm *kvm);
> -int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
> +int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
>  void kvm_irqfd_release(struct kvm *kvm);
>  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
>  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> @@ -833,7 +833,7 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
>  
>  static inline void kvm_eventfd_init(struct kvm *kvm) {}
>  
> -static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
>  {
>  	return -EINVAL;
>  }
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index f59c1e8..89a6fa9 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -49,9 +49,13 @@ struct _irqfd {
>  	wait_queue_t wait;
>  	/* Update side is protected by irqfds.lock */
>  	struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
> -	/* Used for level IRQ fast-path */
> +	/* Used for IRQ fast-path */
>  	int gsi;
>  	struct work_struct inject;
> +	/* Used for level EOI path */
> +	int irq_source_id;
> +	struct eventfd_ctx *eoi_eventfd;
> +	struct kvm_irq_ack_notifier notifier;
>  	/* Used for setup/shutdown */
>  	struct eventfd_ctx *eventfd;
>  	struct list_head list;
> @@ -62,7 +66,7 @@ struct _irqfd {
>  static struct workqueue_struct *irqfd_cleanup_wq;
>  
>  static void
> -irqfd_inject(struct work_struct *work)
> +irqfd_inject_edge(struct work_struct *work)
>  {
>  	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
>  	struct kvm *kvm = irqfd->kvm;
> @@ -71,6 +75,23 @@ irqfd_inject(struct work_struct *work)
>  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>  }
>  
> +static void
> +irqfd_inject_level(struct work_struct *work)
> +{
> +	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> +
> +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1);
> +}
> +
> +static void
> +irqfd_ack_level(struct kvm_irq_ack_notifier *notifier)
> +{
> +	struct _irqfd *irqfd  = container_of(notifier, struct _irqfd, notifier);
> +
> +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
> +	eventfd_signal(irqfd->eoi_eventfd, 1);
> +}
> +
>  /*
>   * Race-free decouple logic (ordering is critical)
>   */
> @@ -96,6 +117,14 @@ irqfd_shutdown(struct work_struct *work)
>  	 * It is now safe to release the object's resources
>  	 */
>  	eventfd_ctx_put(irqfd->eventfd);
> +
> +	if (irqfd->eoi_eventfd) {
> +		kvm_unregister_irq_ack_notifier(irqfd->kvm, &irqfd->notifier);
> +		eventfd_ctx_put(irqfd->eoi_eventfd);
> +		kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
> +		kvm_free_irq_source_id(irqfd->kvm, irqfd->irq_source_id);
> +	}
> +
>  	kfree(irqfd);
>  }
>  
> @@ -198,13 +227,13 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
>  }
>  
>  static int
> -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> +kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
>  {
>  	struct kvm_irq_routing_table *irq_rt;
>  	struct _irqfd *irqfd, *tmp;
>  	struct file *file = NULL;
> -	struct eventfd_ctx *eventfd = NULL;
> -	int ret;
> +	struct eventfd_ctx *eventfd = NULL, *eoi_eventfd = NULL;
> +	int ret, irq_source_id = -1;
>  	unsigned int events;
>  
>  	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> @@ -212,12 +241,35 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  		return -ENOMEM;
>  
>  	irqfd->kvm = kvm;
> -	irqfd->gsi = gsi;
> +	irqfd->gsi = args->gsi;
>  	INIT_LIST_HEAD(&irqfd->list);
> -	INIT_WORK(&irqfd->inject, irqfd_inject);
> +
> +	if (args->flags & KVM_IRQFD_FLAG_LEVEL) {
> +		irq_source_id = kvm_request_irq_source_id(kvm);
> +		if (irq_source_id < 0) {
> +			ret = irq_source_id;
> +			goto fail;
> +		}
> +
> +		eoi_eventfd = eventfd_ctx_fdget(args->eoi_fd);
> +		if (IS_ERR(eoi_eventfd)) {
> +			ret = PTR_ERR(eoi_eventfd);
> +			goto fail;
> +		}
> +
> +		irqfd->irq_source_id = irq_source_id;
> +		irqfd->eoi_eventfd = eoi_eventfd;
> +		irqfd->notifier.gsi = args->gsi;
> +		irqfd->notifier.irq_acked = irqfd_ack_level;
> +		kvm_register_irq_ack_notifier(kvm, &irqfd->notifier);
> +
> +		INIT_WORK(&irqfd->inject, irqfd_inject_level);
> +	} else
> +		INIT_WORK(&irqfd->inject, irqfd_inject_edge);
> +
>  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
>  
> -	file = eventfd_fget(fd);
> +	file = eventfd_fget(args->fd);
>  	if (IS_ERR(file)) {
>  		ret = PTR_ERR(file);
>  		goto fail;
> @@ -282,6 +334,14 @@ fail:
>  	if (!IS_ERR(file))
>  		fput(file);
>  
> +	if (eoi_eventfd && !IS_ERR(eoi_eventfd)) {
> +		kvm_unregister_irq_ack_notifier(kvm, &irqfd->notifier);
> +		eventfd_ctx_put(eoi_eventfd);
> +	}
> +
> +	if (irq_source_id >= 0)
> +		kvm_free_irq_source_id(kvm, irq_source_id);
> +
>  	kfree(irqfd);
>  	return ret;
>  }
> @@ -298,19 +358,19 @@ kvm_eventfd_init(struct kvm *kvm)
>   * shutdown any irqfd's that match fd+gsi
>   */
>  static int
> -kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> +kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
>  {
>  	struct _irqfd *irqfd, *tmp;
>  	struct eventfd_ctx *eventfd;
>  
> -	eventfd = eventfd_ctx_fdget(fd);
> +	eventfd = eventfd_ctx_fdget(args->fd);
>  	if (IS_ERR(eventfd))
>  		return PTR_ERR(eventfd);
>  
>  	spin_lock_irq(&kvm->irqfds.lock);
>  
>  	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
> -		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
> +		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
>  			/*
>  			 * This rcu_assign_pointer is needed for when
>  			 * another thread calls kvm_irq_routing_update before
> @@ -338,12 +398,12 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
>  }
>  
>  int
> -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> +kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
>  {
> -	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> -		return kvm_irqfd_deassign(kvm, fd, gsi);
> +	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
> +		return kvm_irqfd_deassign(kvm, args);
>  
> -	return kvm_irqfd_assign(kvm, fd, gsi);
> +	return kvm_irqfd_assign(kvm, args);
>  }
>  
>  /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 02cb440..b4ad14cc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2059,7 +2059,7 @@ static long kvm_vm_ioctl(struct file *filp,
>  		r = -EFAULT;
>  		if (copy_from_user(&data, argp, sizeof data))
>  			goto out;
> -		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
> +		r = kvm_irqfd(kvm, &data);
>  		break;
>  	}
>  	case KVM_IOEVENTFD: {
> 

Looks reasonable to me.

Jan



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-16 16:34 [RFC PATCH] kvm: Extend irqfd to support level interrupts Alex Williamson
  2012-06-17 11:17 ` Jan Kiszka
@ 2012-06-17 18:44 ` Michael S. Tsirkin
  2012-06-17 21:38   ` Alex Williamson
  2012-06-18  8:02 ` Avi Kivity
  2 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2012-06-17 18:44 UTC (permalink / raw)
  To: Alex Williamson; +Cc: avi, mtosatti, kvm, jan.kiszka

On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
> I'm looking for opinions on this approach.  For vfio device assignment
> we minimally need a way to get EOIs from the in-kernel irqchip out to
> userspace.  Getting that out via an eventfd would allow us to bounce
> all level interrupts out to userspace, where we would de-assert the
> device interrupt in qemu and unmask the physical device.  Ideally we
> could deassert the interrupt in KVM, which allows us to send the EOI
> directly to vfio.  To do that, we need to use a new IRQ source ID so
> the guest sees the logical OR of qemu requested state and external
> device state.

Given that yopu want to involve userspace anyway, why insist on irqfd
for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-17 18:44 ` Michael S. Tsirkin
@ 2012-06-17 21:38   ` Alex Williamson
  2012-06-17 22:15     ` Alex Williamson
  2012-06-18  5:51     ` Michael S. Tsirkin
  0 siblings, 2 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-17 21:38 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: avi, mtosatti, kvm, jan.kiszka

On Sun, 2012-06-17 at 21:44 +0300, Michael S. Tsirkin wrote:
> On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
> > I'm looking for opinions on this approach.  For vfio device assignment
> > we minimally need a way to get EOIs from the in-kernel irqchip out to
> > userspace.  Getting that out via an eventfd would allow us to bounce
> > all level interrupts out to userspace, where we would de-assert the
> > device interrupt in qemu and unmask the physical device.  Ideally we
> > could deassert the interrupt in KVM, which allows us to send the EOI
> > directly to vfio.  To do that, we need to use a new IRQ source ID so
> > the guest sees the logical OR of qemu requested state and external
> > device state.
> 
> Given that yopu want to involve userspace anyway, why insist on irqfd
> for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?

Well, actually I'd like to have a way to bypass userspace, which the
combination of an irqfd + eventfd w/ deassert does.  I'm not quite sure
I understand how KVM_IRQ_LINE_STATUS would work for this.  AIUI, that
effectively gives us a way to post an interrupt AND let us know whether
it was masked, coalesced, or delivered.  So I'd have to poll by posting
a potentially spurious interrupt and if it was spurious unmask the
physical device and wait for a real interrupt?  What am I missing,
because that seems barely functional?  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-17 21:38   ` Alex Williamson
@ 2012-06-17 22:15     ` Alex Williamson
  2012-06-18  6:00       ` Michael S. Tsirkin
  2012-06-18  5:51     ` Michael S. Tsirkin
  1 sibling, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2012-06-17 22:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: avi, mtosatti, kvm, jan.kiszka

On Sun, Jun 17, 2012 at 3:38 PM, Alex Williamson
<alex.williamson@redhat.com> wrote:
> On Sun, 2012-06-17 at 21:44 +0300, Michael S. Tsirkin wrote:
>> On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
>> > I'm looking for opinions on this approach.  For vfio device assignment
>> > we minimally need a way to get EOIs from the in-kernel irqchip out to
>> > userspace.  Getting that out via an eventfd would allow us to bounce
>> > all level interrupts out to userspace, where we would de-assert the
>> > device interrupt in qemu and unmask the physical device.  Ideally we
>> > could deassert the interrupt in KVM, which allows us to send the EOI
>> > directly to vfio.  To do that, we need to use a new IRQ source ID so
>> > the guest sees the logical OR of qemu requested state and external
>> > device state.
>>
>> Given that yopu want to involve userspace anyway, why insist on irqfd
>> for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?
>
> Well, actually I'd like to have a way to bypass userspace, which the
> combination of an irqfd + eventfd w/ deassert does.  I'm not quite sure
> I understand how KVM_IRQ_LINE_STATUS would work for this.  AIUI, that
> effectively gives us a way to post an interrupt AND let us know whether
> it was masked, coalesced, or delivered.  So I'd have to poll by posting
> a potentially spurious interrupt and if it was spurious unmask the
> physical device and wait for a real interrupt?  What am I missing,
> because that seems barely functional?  Thanks,

Just to clarify, setting the interrupt from qemu isn't a problem.  We
can do that just like any other device.  The unique aspect is that we
need to know when the guest has issued an EOI so that we can unmask
the physical device interrupt and wait for it to fire again.  This is
where I don't understand how KVM_IRQ_LINE_STATUS helps us.  The
minimal support I mention above just requires informing userspace
about the EOI, then we can deassert and unmask from qemu.  That means
we issue two more ioctl before we're enabled for the next interrupt.
Rather than invent a new interface for a sub-optimal implementation,
fixing irqfd to support level triggered interrupts is potentially more
useful and I think this implementation is not specific to device
assignment.  BTW, what happens with vhost use of irqfd when the guest
runs out of MSI vectors?  Could it use this interface for that?
Thanks,

Alex

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-17 21:38   ` Alex Williamson
  2012-06-17 22:15     ` Alex Williamson
@ 2012-06-18  5:51     ` Michael S. Tsirkin
  2012-06-18 14:06       ` Alex Williamson
  1 sibling, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2012-06-18  5:51 UTC (permalink / raw)
  To: Alex Williamson; +Cc: avi, mtosatti, kvm, jan.kiszka

On Sun, Jun 17, 2012 at 03:38:57PM -0600, Alex Williamson wrote:
> On Sun, 2012-06-17 at 21:44 +0300, Michael S. Tsirkin wrote:
> > On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
> > > I'm looking for opinions on this approach.  For vfio device assignment
> > > we minimally need a way to get EOIs from the in-kernel irqchip out to
> > > userspace.  Getting that out via an eventfd would allow us to bounce
> > > all level interrupts out to userspace, where we would de-assert the
> > > device interrupt in qemu and unmask the physical device.  Ideally we
> > > could deassert the interrupt in KVM, which allows us to send the EOI
> > > directly to vfio.  To do that, we need to use a new IRQ source ID so
> > > the guest sees the logical OR of qemu requested state and external
> > > device state.
> > 
> > Given that yopu want to involve userspace anyway, why insist on irqfd
> > for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?
> 
> Well, actually I'd like to have a way to bypass userspace, which the
> combination of an irqfd + eventfd w/ deassert does.

Above you said "bounce all level interrupts out to userspace"?

>  I'm not quite sure
> I understand how KVM_IRQ_LINE_STATUS would work for this.  AIUI, that
> effectively gives us a way to post an interrupt AND let us know whether
> it was masked, coalesced, or delivered.  So I'd have to poll by posting
> a potentially spurious interrupt and if it was spurious unmask the
> physical device and wait for a real interrupt?  What am I missing,
> because that seems barely functional?  Thanks,
> 
> Alex

Sorry if I did not make this clear.  You still need to add a way to pass
EOI writes out to userspace. But that's all you need: once
userspace gets EOI events for specific GSIs, it can control IRQ level from userspace
in the same way it does it for emulated devices:
wake up on EOI, poll all devices, if no interrupt asserted -
clear IRQ line and unmask IRQ in assigned device.

Above is with plain exit to userspace rather than eventfd,
though eventfd is also possible but then you need to clear
irq level in kernel and re-assert from userspace later -
I think it's ok but need to think about possible races.
 

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-17 22:15     ` Alex Williamson
@ 2012-06-18  6:00       ` Michael S. Tsirkin
  2012-06-18 14:00         ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2012-06-18  6:00 UTC (permalink / raw)
  To: Alex Williamson; +Cc: avi, mtosatti, kvm, jan.kiszka

On Sun, Jun 17, 2012 at 04:15:44PM -0600, Alex Williamson wrote:
> On Sun, Jun 17, 2012 at 3:38 PM, Alex Williamson
> <alex.williamson@redhat.com> wrote:
> > On Sun, 2012-06-17 at 21:44 +0300, Michael S. Tsirkin wrote:
> >> On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
> >> > I'm looking for opinions on this approach.  For vfio device assignment
> >> > we minimally need a way to get EOIs from the in-kernel irqchip out to
> >> > userspace.  Getting that out via an eventfd would allow us to bounce
> >> > all level interrupts out to userspace, where we would de-assert the
> >> > device interrupt in qemu and unmask the physical device.  Ideally we
> >> > could deassert the interrupt in KVM, which allows us to send the EOI
> >> > directly to vfio.  To do that, we need to use a new IRQ source ID so
> >> > the guest sees the logical OR of qemu requested state and external
> >> > device state.
> >>
> >> Given that yopu want to involve userspace anyway, why insist on irqfd
> >> for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?
> >
> > Well, actually I'd like to have a way to bypass userspace, which the
> > combination of an irqfd + eventfd w/ deassert does.


Hmm but above you say
	> >> > Getting that out via an eventfd would allow us to bounce
	> >> > all level interrupts out to userspace, where we would de-assert the
	> >> > device interrupt in qemu and unmask the physical device.
so what is the plan?

>  I'm not quite sure
> > I understand how KVM_IRQ_LINE_STATUS would work for this.  AIUI, that
> > effectively gives us a way to post an interrupt AND let us know whether
> > it was masked, coalesced, or delivered.  So I'd have to poll by posting
> > a potentially spurious interrupt and if it was spurious unmask the
> > physical device and wait for a real interrupt?  What am I missing,
> > because that seems barely functional?  Thanks,
> 
> Just to clarify, setting the interrupt from qemu isn't a problem.  We
> can do that just like any other device.  The unique aspect is that we
> need to know when the guest has issued an EOI so that we can unmask
> the physical device interrupt and wait for it to fire again.  This is
> where I don't understand how KVM_IRQ_LINE_STATUS helps us.
> The minimal support I mention above just requires informing userspace
> about the EOI, then we can deassert and unmask from qemu.  That means
> we issue two more ioctl before we're enabled for the next interrupt.

Exactly.

> Rather than invent a new interface for a sub-optimal implementation,
> fixing irqfd to support level triggered interrupts is potentially more
> useful and I think this implementation is not specific to device
> assignment.  BTW, what happens with vhost use of irqfd when the guest
> runs out of MSI vectors?  Could it use this interface for that?
> Thanks,
> 
> Alex


Sure. OTOH this never was a real issue - if it was
we could teach Linux to share MSI interrupt.

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-16 16:34 [RFC PATCH] kvm: Extend irqfd to support level interrupts Alex Williamson
  2012-06-17 11:17 ` Jan Kiszka
  2012-06-17 18:44 ` Michael S. Tsirkin
@ 2012-06-18  8:02 ` Avi Kivity
  2012-06-18  8:52   ` Jan Kiszka
  2012-06-18 14:18   ` Alex Williamson
  2 siblings, 2 replies; 24+ messages in thread
From: Avi Kivity @ 2012-06-18  8:02 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mtosatti, kvm, mst, jan.kiszka

On 06/16/2012 07:34 PM, Alex Williamson wrote:
> I'm looking for opinions on this approach.  For vfio device assignment
> we minimally need a way to get EOIs from the in-kernel irqchip out to
> userspace.  Getting that out via an eventfd would allow us to bounce
> all level interrupts out to userspace, where we would de-assert the
> device interrupt in qemu and unmask the physical device.  Ideally we
> could deassert the interrupt in KVM, which allows us to send the EOI
> directly to vfio.  To do that, we need to use a new IRQ source ID so
> the guest sees the logical OR of qemu requested state and external
> device state.  This allows external devices to share interrupts with
> emulated devices, just like KVM assignment.  That means the means we
> also need to use a new source ID when injecting the interrupt via
> irqfd.
> 
> Rather than creating a source ID allocation interface, extending irqfd
> to support a user supplied source ID, and creating another new
> interface to get the EOI out, I think it works out better to bundle
> these all together as just a level irqfd interface.  This way we don't
> allow users to create unbalanced states where a level interrupt is
> asserted, but has no way of being deasserted.  I believe the below
> does this, but needs testing and validation with an implementation in
> qemu.
> 

Missing documentation, which helps at least one reviewer review.  It's
not just for a commit.

>  
> +static void
> +irqfd_inject_level(struct work_struct *work)
> +{
> +	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> +
> +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1);
> +}
> +
> +static void
> +irqfd_ack_level(struct kvm_irq_ack_notifier *notifier)
> +{
> +	struct _irqfd *irqfd  = container_of(notifier, struct _irqfd, notifier);
> +
> +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
> +	eventfd_signal(irqfd->eoi_eventfd, 1);
> +}
> +

I don't understand how this works.  A level IRQ isn't de-asserted by the
EOI, it's de-asserted by its source.

Consider the following sequence:

device        guest

  event
   assert
              interrupt
               interrupt handler
                handle event
                clear ISR bit
   deassert
  event
    assert
               EOI

What should happen is that the interrupt will be redelivered
immmediately after the EOI, but that won't happen with your API since
the EOI ack notifier will deassert the interrupt and nothing will
re-assert it.


-- 
error compiling committee.c: too many arguments to function



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18  8:02 ` Avi Kivity
@ 2012-06-18  8:52   ` Jan Kiszka
  2012-06-18  9:33     ` Avi Kivity
  2012-06-18 14:18   ` Alex Williamson
  1 sibling, 1 reply; 24+ messages in thread
From: Jan Kiszka @ 2012-06-18  8:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Alex Williamson, mtosatti, kvm, mst

On 2012-06-18 10:02, Avi Kivity wrote:
> On 06/16/2012 07:34 PM, Alex Williamson wrote:
>> I'm looking for opinions on this approach.  For vfio device assignment
>> we minimally need a way to get EOIs from the in-kernel irqchip out to
>> userspace.  Getting that out via an eventfd would allow us to bounce
>> all level interrupts out to userspace, where we would de-assert the
>> device interrupt in qemu and unmask the physical device.  Ideally we
>> could deassert the interrupt in KVM, which allows us to send the EOI
>> directly to vfio.  To do that, we need to use a new IRQ source ID so
>> the guest sees the logical OR of qemu requested state and external
>> device state.  This allows external devices to share interrupts with
>> emulated devices, just like KVM assignment.  That means the means we
>> also need to use a new source ID when injecting the interrupt via
>> irqfd.
>>
>> Rather than creating a source ID allocation interface, extending irqfd
>> to support a user supplied source ID, and creating another new
>> interface to get the EOI out, I think it works out better to bundle
>> these all together as just a level irqfd interface.  This way we don't
>> allow users to create unbalanced states where a level interrupt is
>> asserted, but has no way of being deasserted.  I believe the below
>> does this, but needs testing and validation with an implementation in
>> qemu.
>>
> 
> Missing documentation, which helps at least one reviewer review.  It's
> not just for a commit.
> 
>>  
>> +static void
>> +irqfd_inject_level(struct work_struct *work)
>> +{
>> +	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
>> +
>> +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1);
>> +}
>> +
>> +static void
>> +irqfd_ack_level(struct kvm_irq_ack_notifier *notifier)
>> +{
>> +	struct _irqfd *irqfd  = container_of(notifier, struct _irqfd, notifier);
>> +
>> +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
>> +	eventfd_signal(irqfd->eoi_eventfd, 1);
>> +}
>> +
> 
> I don't understand how this works.  A level IRQ isn't de-asserted by the
> EOI, it's de-asserted by its source.
> 
> Consider the following sequence:
> 
> device        guest
> 
>   event
>    assert
>               interrupt
>                interrupt handler
>                 handle event
>                 clear ISR bit
>    deassert
>   event
>     assert
>                EOI
> 
> What should happen is that the interrupt will be redelivered
> immmediately after the EOI, but that won't happen with your API since
> the EOI ack notifier will deassert the interrupt and nothing will
> re-assert it.

As it's level triggered and we unmask the physical source, another
host-side interrupt will be triggered and then reported to the guest.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18  8:52   ` Jan Kiszka
@ 2012-06-18  9:33     ` Avi Kivity
  2012-06-18 10:11       ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Avi Kivity @ 2012-06-18  9:33 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Alex Williamson, mtosatti, kvm, mst

On 06/18/2012 11:52 AM, Jan Kiszka wrote:
>> 
>> I don't understand how this works.  A level IRQ isn't de-asserted by the
>> EOI, it's de-asserted by its source.
>> 
>> Consider the following sequence:
>> 
>> device        guest
>> 
>>   event
>>    assert
>>               interrupt
>>                interrupt handler
>>                 handle event
>>                 clear ISR bit
>>    deassert
>>   event
>>     assert
>>                EOI
>> 
>> What should happen is that the interrupt will be redelivered
>> immmediately after the EOI, but that won't happen with your API since
>> the EOI ack notifier will deassert the interrupt and nothing will
>> re-assert it.
> 
> As it's level triggered and we unmask the physical source, another
> host-side interrupt will be triggered and then reported to the guest.

That works for real devices.  What about emulated devices (vhost,
msi-less ivshmem clone)?

-- 
error compiling committee.c: too many arguments to function



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18  9:33     ` Avi Kivity
@ 2012-06-18 10:11       ` Michael S. Tsirkin
  2012-06-18 10:14         ` Avi Kivity
  2012-06-18 14:23         ` Alex Williamson
  0 siblings, 2 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2012-06-18 10:11 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Jan Kiszka, Alex Williamson, mtosatti, kvm

On Mon, Jun 18, 2012 at 12:33:01PM +0300, Avi Kivity wrote:
> On 06/18/2012 11:52 AM, Jan Kiszka wrote:
> >> 
> >> I don't understand how this works.  A level IRQ isn't de-asserted by the
> >> EOI, it's de-asserted by its source.
> >> 
> >> Consider the following sequence:
> >> 
> >> device        guest
> >> 
> >>   event
> >>    assert
> >>               interrupt
> >>                interrupt handler
> >>                 handle event
> >>                 clear ISR bit
> >>    deassert
> >>   event
> >>     assert
> >>                EOI
> >> 
> >> What should happen is that the interrupt will be redelivered
> >> immmediately after the EOI, but that won't happen with your API since
> >> the EOI ack notifier will deassert the interrupt and nothing will
> >> re-assert it.
> > 
> > As it's level triggered and we unmask the physical source, another
> > host-side interrupt will be triggered and then reported to the guest.
> 
> That works for real devices.  What about emulated devices

It's broken for userspace too.  I guess it
should track the logical state of the device per source id.  On ack, it
should clear it for assigned devices only, do the
logical OR over all source IDs and set level to that.

> (vhost,
> msi-less ivshmem clone)?

I guess vhost can poll eventfd and reinject an interrupt.
Of course to bypass qemu completely we also need to support reads over
ioeventfd somehow.


> -- 
> error compiling committee.c: too many arguments to function
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 10:11       ` Michael S. Tsirkin
@ 2012-06-18 10:14         ` Avi Kivity
  2012-06-18 10:55           ` Michael S. Tsirkin
  2012-06-18 14:27           ` Alex Williamson
  2012-06-18 14:23         ` Alex Williamson
  1 sibling, 2 replies; 24+ messages in thread
From: Avi Kivity @ 2012-06-18 10:14 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jan Kiszka, Alex Williamson, mtosatti, kvm

On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:

>> (vhost,
>> msi-less ivshmem clone)?
> 
> I guess vhost can poll eventfd and reinject an interrupt.
> Of course to bypass qemu completely we also need to support reads over
> ioeventfd somehow.
> 

eventfd is not suitable for level triggered interrupts as far as I can
tell.  It's about passing edges, and level interrupts aren't.  We need
something like a pipe for communicating state (or just use KVM_IRQ_LINE).


-- 
error compiling committee.c: too many arguments to function



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 10:14         ` Avi Kivity
@ 2012-06-18 10:55           ` Michael S. Tsirkin
  2012-06-18 11:03             ` Avi Kivity
  2012-06-18 14:27           ` Alex Williamson
  1 sibling, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2012-06-18 10:55 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Jan Kiszka, Alex Williamson, mtosatti, kvm

On Mon, Jun 18, 2012 at 01:14:13PM +0300, Avi Kivity wrote:
> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
> 
> >> (vhost,
> >> msi-less ivshmem clone)?
> > 
> > I guess vhost can poll eventfd and reinject an interrupt.
> > Of course to bypass qemu completely we also need to support reads over
> > ioeventfd somehow.
> > 
> 
> eventfd is not suitable for level triggered interrupts as far as I can
> tell.  It's about passing edges, and level interrupts aren't.  We need
> something like a pipe for communicating state (or just use KVM_IRQ_LINE).

Just using KVM_IRQ_LINE won't be enough.
There's no software event when device deasserts the
line, and no stadard way for drivers to do this.
So kvm needs to trigger on some event to poll the real interrupt
state, to check whether it's ok to clear for the guest,
and forward it to userspace.


> -- 
> error compiling committee.c: too many arguments to function
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 10:55           ` Michael S. Tsirkin
@ 2012-06-18 11:03             ` Avi Kivity
  2012-06-18 11:17               ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Avi Kivity @ 2012-06-18 11:03 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jan Kiszka, Alex Williamson, mtosatti, kvm

On 06/18/2012 01:55 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 18, 2012 at 01:14:13PM +0300, Avi Kivity wrote:
>> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
>> 
>> >> (vhost,
>> >> msi-less ivshmem clone)?
>> > 
>> > I guess vhost can poll eventfd and reinject an interrupt.
>> > Of course to bypass qemu completely we also need to support reads over
>> > ioeventfd somehow.
>> > 
>> 
>> eventfd is not suitable for level triggered interrupts as far as I can
>> tell.  It's about passing edges, and level interrupts aren't.  We need
>> something like a pipe for communicating state (or just use KVM_IRQ_LINE).
> 
> Just using KVM_IRQ_LINE won't be enough.
> There's no software event when device deasserts the
> line, and no stadard way for drivers to do this.
> So kvm needs to trigger on some event to poll the real interrupt
> state, to check whether it's ok to clear for the guest,
> and forward it to userspace.

This in turn only applies to real devices.  Emulated devices of course
know the state of the emulated interrupt line.

Maybe we need different interfaces then.

-- 
error compiling committee.c: too many arguments to function



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 11:03             ` Avi Kivity
@ 2012-06-18 11:17               ` Michael S. Tsirkin
  2012-06-18 14:32                 ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2012-06-18 11:17 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Jan Kiszka, Alex Williamson, mtosatti, kvm

On Mon, Jun 18, 2012 at 02:03:40PM +0300, Avi Kivity wrote:
> On 06/18/2012 01:55 PM, Michael S. Tsirkin wrote:
> > On Mon, Jun 18, 2012 at 01:14:13PM +0300, Avi Kivity wrote:
> >> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
> >> 
> >> >> (vhost,
> >> >> msi-less ivshmem clone)?
> >> > 
> >> > I guess vhost can poll eventfd and reinject an interrupt.
> >> > Of course to bypass qemu completely we also need to support reads over
> >> > ioeventfd somehow.
> >> > 
> >> 
> >> eventfd is not suitable for level triggered interrupts as far as I can
> >> tell.  It's about passing edges, and level interrupts aren't.  We need
> >> something like a pipe for communicating state (or just use KVM_IRQ_LINE).
> > 
> > Just using KVM_IRQ_LINE won't be enough.
> > There's no software event when device deasserts the
> > line, and no stadard way for drivers to do this.
> > So kvm needs to trigger on some event to poll the real interrupt
> > state, to check whether it's ok to clear for the guest,
> > and forward it to userspace.
> 
> This in turn only applies to real devices.  Emulated devices of course
> know the state of the emulated interrupt line.

Or to be more exact, real devices know the state of the line too.
It's the hyprvisor that doesn't know the state, because
of the abovementioned lack of standards, and because
the state control can be mapped directly into guest.

For example the hypervisor knows about virtio ISR
register so there's no problem.

> Maybe we need different interfaces then.
> 
> -- 
> error compiling committee.c: too many arguments to function
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18  6:00       ` Michael S. Tsirkin
@ 2012-06-18 14:00         ` Alex Williamson
  0 siblings, 0 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 14:00 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: avi, mtosatti, kvm, jan.kiszka

On Mon, 2012-06-18 at 09:00 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 17, 2012 at 04:15:44PM -0600, Alex Williamson wrote:
> > On Sun, Jun 17, 2012 at 3:38 PM, Alex Williamson
> > <alex.williamson@redhat.com> wrote:
> > > On Sun, 2012-06-17 at 21:44 +0300, Michael S. Tsirkin wrote:
> > >> On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
> > >> > I'm looking for opinions on this approach.  For vfio device assignment
> > >> > we minimally need a way to get EOIs from the in-kernel irqchip out to
> > >> > userspace.  Getting that out via an eventfd would allow us to bounce
> > >> > all level interrupts out to userspace, where we would de-assert the
> > >> > device interrupt in qemu and unmask the physical device.  Ideally we
> > >> > could deassert the interrupt in KVM, which allows us to send the EOI
> > >> > directly to vfio.  To do that, we need to use a new IRQ source ID so
> > >> > the guest sees the logical OR of qemu requested state and external
> > >> > device state.
> > >>
> > >> Given that yopu want to involve userspace anyway, why insist on irqfd
> > >> for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?
> > >
> > > Well, actually I'd like to have a way to bypass userspace, which the
> > > combination of an irqfd + eventfd w/ deassert does.
> 
> 
> Hmm but above you say
> 	> >> > Getting that out via an eventfd would allow us to bounce
> 	> >> > all level interrupts out to userspace, where we would de-assert the
> 	> >> > device interrupt in qemu and unmask the physical device.
> so what is the plan?

Sorry if this wasn't clear, I was attempting to state the minimal
required support vs a more ideal scenario.  The patch here ignores the
minimal support and implements the higher performance route.

> >  I'm not quite sure
> > > I understand how KVM_IRQ_LINE_STATUS would work for this.  AIUI, that
> > > effectively gives us a way to post an interrupt AND let us know whether
> > > it was masked, coalesced, or delivered.  So I'd have to poll by posting
> > > a potentially spurious interrupt and if it was spurious unmask the
> > > physical device and wait for a real interrupt?  What am I missing,
> > > because that seems barely functional?  Thanks,
> > 
> > Just to clarify, setting the interrupt from qemu isn't a problem.  We
> > can do that just like any other device.  The unique aspect is that we
> > need to know when the guest has issued an EOI so that we can unmask
> > the physical device interrupt and wait for it to fire again.  This is
> > where I don't understand how KVM_IRQ_LINE_STATUS helps us.
> > The minimal support I mention above just requires informing userspace
> > about the EOI, then we can deassert and unmask from qemu.  That means
> > we issue two more ioctl before we're enabled for the next interrupt.
> 
> Exactly.
> 
> > Rather than invent a new interface for a sub-optimal implementation,
> > fixing irqfd to support level triggered interrupts is potentially more
> > useful and I think this implementation is not specific to device
> > assignment.  BTW, what happens with vhost use of irqfd when the guest
> > runs out of MSI vectors?  Could it use this interface for that?
> > Thanks,
> > 
> > Alex
> 
> 
> Sure. OTOH this never was a real issue - if it was
> we could teach Linux to share MSI interrupt.


Or optimize the route without changing the guest.  Thanks,

Alex



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18  5:51     ` Michael S. Tsirkin
@ 2012-06-18 14:06       ` Alex Williamson
  0 siblings, 0 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 14:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: avi, mtosatti, kvm, jan.kiszka

On Mon, 2012-06-18 at 08:51 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 17, 2012 at 03:38:57PM -0600, Alex Williamson wrote:
> > On Sun, 2012-06-17 at 21:44 +0300, Michael S. Tsirkin wrote:
> > > On Sat, Jun 16, 2012 at 10:34:39AM -0600, Alex Williamson wrote:
> > > > I'm looking for opinions on this approach.  For vfio device assignment
> > > > we minimally need a way to get EOIs from the in-kernel irqchip out to
> > > > userspace.  Getting that out via an eventfd would allow us to bounce
> > > > all level interrupts out to userspace, where we would de-assert the
> > > > device interrupt in qemu and unmask the physical device.  Ideally we
> > > > could deassert the interrupt in KVM, which allows us to send the EOI
> > > > directly to vfio.  To do that, we need to use a new IRQ source ID so
> > > > the guest sees the logical OR of qemu requested state and external
> > > > device state.
> > > 
> > > Given that yopu want to involve userspace anyway, why insist on irqfd
> > > for this?  You can simply use KVM_IRQ_LINE_STATUS from qemu, no?
> > 
> > Well, actually I'd like to have a way to bypass userspace, which the
> > combination of an irqfd + eventfd w/ deassert does.
> 
> Above you said "bounce all level interrupts out to userspace"?

And then go on to discuss what I'd rather implement instead.

> >  I'm not quite sure
> > I understand how KVM_IRQ_LINE_STATUS would work for this.  AIUI, that
> > effectively gives us a way to post an interrupt AND let us know whether
> > it was masked, coalesced, or delivered.  So I'd have to poll by posting
> > a potentially spurious interrupt and if it was spurious unmask the
> > physical device and wait for a real interrupt?  What am I missing,
> > because that seems barely functional?  Thanks,
> > 
> > Alex
> 
> Sorry if I did not make this clear.  You still need to add a way to pass
> EOI writes out to userspace. But that's all you need: once
> userspace gets EOI events for specific GSIs, it can control IRQ level from userspace
> in the same way it does it for emulated devices:
> wake up on EOI, poll all devices, if no interrupt asserted -
> clear IRQ line and unmask IRQ in assigned device.

Right, this is all we need for a completely mediocre implementation.

> Above is with plain exit to userspace rather than eventfd,
> though eventfd is also possible but then you need to clear
> irq level in kernel and re-assert from userspace later -
> I think it's ok but need to think about possible races.

And this is more along the lines of what I'm attempting to do, using a
new irq source id so that the irqchip handles taking the OR of inputs.
We can always reassert the interrupt if the de-assert was unnecessary.
I thought it more beneficial to optimize for the interrupt being
handled.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18  8:02 ` Avi Kivity
  2012-06-18  8:52   ` Jan Kiszka
@ 2012-06-18 14:18   ` Alex Williamson
  2012-06-18 14:35     ` Avi Kivity
  1 sibling, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 14:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: mtosatti, kvm, mst, jan.kiszka

On Mon, 2012-06-18 at 11:02 +0300, Avi Kivity wrote:
> On 06/16/2012 07:34 PM, Alex Williamson wrote:
> > I'm looking for opinions on this approach.  For vfio device assignment
> > we minimally need a way to get EOIs from the in-kernel irqchip out to
> > userspace.  Getting that out via an eventfd would allow us to bounce
> > all level interrupts out to userspace, where we would de-assert the
> > device interrupt in qemu and unmask the physical device.  Ideally we
> > could deassert the interrupt in KVM, which allows us to send the EOI
> > directly to vfio.  To do that, we need to use a new IRQ source ID so
> > the guest sees the logical OR of qemu requested state and external
> > device state.  This allows external devices to share interrupts with
> > emulated devices, just like KVM assignment.  That means the means we
> > also need to use a new source ID when injecting the interrupt via
> > irqfd.
> > 
> > Rather than creating a source ID allocation interface, extending irqfd
> > to support a user supplied source ID, and creating another new
> > interface to get the EOI out, I think it works out better to bundle
> > these all together as just a level irqfd interface.  This way we don't
> > allow users to create unbalanced states where a level interrupt is
> > asserted, but has no way of being deasserted.  I believe the below
> > does this, but needs testing and validation with an implementation in
> > qemu.
> > 
> 
> Missing documentation, which helps at least one reviewer review.  It's
> not just for a commit.

Yeah, sorry.  There's no irqfd documentation and I didn't a chance to
write it so I could add my blurb to it.
  
> > +static void
> > +irqfd_inject_level(struct work_struct *work)
> > +{
> > +	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> > +
> > +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1);
> > +}
> > +
> > +static void
> > +irqfd_ack_level(struct kvm_irq_ack_notifier *notifier)
> > +{
> > +	struct _irqfd *irqfd  = container_of(notifier, struct _irqfd, notifier);
> > +
> > +	kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0);
> > +	eventfd_signal(irqfd->eoi_eventfd, 1);
> > +}
> > +
> 
> I don't understand how this works.  A level IRQ isn't de-asserted by the
> EOI, it's de-asserted by its source.
> 
> Consider the following sequence:
> 
> device        guest
> 
>   event
>    assert
>               interrupt
>                interrupt handler
>                 handle event
>                 clear ISR bit
>    deassert
>   event
>     assert
>                EOI
> 
> What should happen is that the interrupt will be redelivered
> immmediately after the EOI, but that won't happen with your API since
> the EOI ack notifier will deassert the interrupt and nothing will
> re-assert it.

A device that can de-assert it's own interrupt based on register/config
access probably isn't going to subscribe to this interface.  Such a
device already has much greater visibility of it's service needs.  In
the case where we have external devices, they'll reassert the interrupt,
which is part of this api that will need to be documented.

Comparing this to real hardware, an eoi causes the ioapic to reassert
the interrupt if any of it's inputs are still active.  Here we
preemptively de-assert our interrupt and require the user of the
interface to re-assert.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 10:11       ` Michael S. Tsirkin
  2012-06-18 10:14         ` Avi Kivity
@ 2012-06-18 14:23         ` Alex Williamson
  1 sibling, 0 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 14:23 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Jan Kiszka, mtosatti, kvm

On Mon, 2012-06-18 at 13:11 +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 18, 2012 at 12:33:01PM +0300, Avi Kivity wrote:
> > On 06/18/2012 11:52 AM, Jan Kiszka wrote:
> > >> 
> > >> I don't understand how this works.  A level IRQ isn't de-asserted by the
> > >> EOI, it's de-asserted by its source.
> > >> 
> > >> Consider the following sequence:
> > >> 
> > >> device        guest
> > >> 
> > >>   event
> > >>    assert
> > >>               interrupt
> > >>                interrupt handler
> > >>                 handle event
> > >>                 clear ISR bit
> > >>    deassert
> > >>   event
> > >>     assert
> > >>                EOI
> > >> 
> > >> What should happen is that the interrupt will be redelivered
> > >> immmediately after the EOI, but that won't happen with your API since
> > >> the EOI ack notifier will deassert the interrupt and nothing will
> > >> re-assert it.
> > > 
> > > As it's level triggered and we unmask the physical source, another
> > > host-side interrupt will be triggered and then reported to the guest.
> > 
> > That works for real devices.  What about emulated devices
> 
> It's broken for userspace too.  I guess it
> should track the logical state of the device per source id.  On ack, it
> should clear it for assigned devices only, do the
> logical OR over all source IDs and set level to that.

That's basically what it's doing.  Maybe assigned devices are the only
user of this interface, but I'm trying not to make it device assignment
specific.  I'm grabbing a new source id for each user of this interface
specifically so the irqchip just sees it as one of many inputs.

> > (vhost,
> > msi-less ivshmem clone)?
> 
> I guess vhost can poll eventfd and reinject an interrupt.
> Of course to bypass qemu completely we also need to support reads over
> ioeventfd somehow.

If you're going to reinject an interrupt anyway, wouldn't you rather be
told when to do it via an eventfd like this instead of poll?  Thanks,

Alex



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 10:14         ` Avi Kivity
  2012-06-18 10:55           ` Michael S. Tsirkin
@ 2012-06-18 14:27           ` Alex Williamson
  2012-06-18 14:33             ` Avi Kivity
  1 sibling, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 14:27 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Michael S. Tsirkin, Jan Kiszka, mtosatti, kvm

On Mon, 2012-06-18 at 13:14 +0300, Avi Kivity wrote:
> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
> 
> >> (vhost,
> >> msi-less ivshmem clone)?
> > 
> > I guess vhost can poll eventfd and reinject an interrupt.
> > Of course to bypass qemu completely we also need to support reads over
> > ioeventfd somehow.
> > 
> 
> eventfd is not suitable for level triggered interrupts as far as I can
> tell.  It's about passing edges, and level interrupts aren't.  We need
> something like a pipe for communicating state (or just use KVM_IRQ_LINE).

Isn't level just two edges with a state in between?  Sure we may be
unnecessarily de-asserting for brief periods, but for an external
assigned device, we don't know whether it's unnecessary.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 11:17               ` Michael S. Tsirkin
@ 2012-06-18 14:32                 ` Alex Williamson
  0 siblings, 0 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 14:32 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Jan Kiszka, mtosatti, kvm

On Mon, 2012-06-18 at 14:17 +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 18, 2012 at 02:03:40PM +0300, Avi Kivity wrote:
> > On 06/18/2012 01:55 PM, Michael S. Tsirkin wrote:
> > > On Mon, Jun 18, 2012 at 01:14:13PM +0300, Avi Kivity wrote:
> > >> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
> > >> 
> > >> >> (vhost,
> > >> >> msi-less ivshmem clone)?
> > >> > 
> > >> > I guess vhost can poll eventfd and reinject an interrupt.
> > >> > Of course to bypass qemu completely we also need to support reads over
> > >> > ioeventfd somehow.
> > >> > 
> > >> 
> > >> eventfd is not suitable for level triggered interrupts as far as I can
> > >> tell.  It's about passing edges, and level interrupts aren't.  We need
> > >> something like a pipe for communicating state (or just use KVM_IRQ_LINE).
> > > 
> > > Just using KVM_IRQ_LINE won't be enough.
> > > There's no software event when device deasserts the
> > > line, and no stadard way for drivers to do this.
> > > So kvm needs to trigger on some event to poll the real interrupt
> > > state, to check whether it's ok to clear for the guest,
> > > and forward it to userspace.
> > 
> > This in turn only applies to real devices.  Emulated devices of course
> > know the state of the emulated interrupt line.
> 
> Or to be more exact, real devices know the state of the line too.
> It's the hyprvisor that doesn't know the state, because
> of the abovementioned lack of standards, and because
> the state control can be mapped directly into guest.

Yes, exactly.

> For example the hypervisor knows about virtio ISR
> register so there's no problem.
> 
> > Maybe we need different interfaces then.

This interface is perhaps not as useful for anyone else as I was hoping,
but is it the wrong interface?  It still seems like a logical extension
of irqfd, even if the use case is small.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 14:27           ` Alex Williamson
@ 2012-06-18 14:33             ` Avi Kivity
  2012-06-18 16:47               ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Avi Kivity @ 2012-06-18 14:33 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Michael S. Tsirkin, Jan Kiszka, mtosatti, kvm

On 06/18/2012 05:27 PM, Alex Williamson wrote:
> On Mon, 2012-06-18 at 13:14 +0300, Avi Kivity wrote:
>> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
>> 
>> >> (vhost,
>> >> msi-less ivshmem clone)?
>> > 
>> > I guess vhost can poll eventfd and reinject an interrupt.
>> > Of course to bypass qemu completely we also need to support reads over
>> > ioeventfd somehow.
>> > 
>> 
>> eventfd is not suitable for level triggered interrupts as far as I can
>> tell.  It's about passing edges, and level interrupts aren't.  We need
>> something like a pipe for communicating state (or just use KVM_IRQ_LINE).
> 
> Isn't level just two edges with a state in between?  Sure we may be
> unnecessarily de-asserting for brief periods, but for an external
> assigned device, we don't know whether it's unnecessary.  Thanks,

For an assigned device you have someone maintaining the state and keep
pushing you edges as long as it's up.  So it's okay for you to keep
deasserting the interrupt as the device will fix it up for you.  For an
emulated device, you don't, so it isn't.

btw, technically we're in the wrong here.  An interrupt handler may read
the irr and see an incorrect value.  Of course no one does that so we
can get away with it.


-- 
error compiling committee.c: too many arguments to function



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 14:18   ` Alex Williamson
@ 2012-06-18 14:35     ` Avi Kivity
  0 siblings, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2012-06-18 14:35 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mtosatti, kvm, mst, jan.kiszka

On 06/18/2012 05:18 PM, Alex Williamson wrote:
>> 
>> I don't understand how this works.  A level IRQ isn't de-asserted by the
>> EOI, it's de-asserted by its source.
>> 
>> Consider the following sequence:
>> 
>> device        guest
>> 
>>   event
>>    assert
>>               interrupt
>>                interrupt handler
>>                 handle event
>>                 clear ISR bit
>>    deassert
>>   event
>>     assert
>>                EOI
>> 
>> What should happen is that the interrupt will be redelivered
>> immmediately after the EOI, but that won't happen with your API since
>> the EOI ack notifier will deassert the interrupt and nothing will
>> re-assert it.
> 
> A device that can de-assert it's own interrupt based on register/config
> access probably isn't going to subscribe to this interface.  Such a
> device already has much greater visibility of it's service needs.  In
> the case where we have external devices, they'll reassert the interrupt,
> which is part of this api that will need to be documented.
> 
> Comparing this to real hardware, an eoi causes the ioapic to reassert
> the interrupt if any of it's inputs are still active.  Here we
> preemptively de-assert our interrupt and require the user of the
> interface to re-assert.  Thanks,

Okay.  I guess this limits it assigned devices (which is fine, it just
needs to be documented clearly).

-- 
error compiling committee.c: too many arguments to function



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] kvm: Extend irqfd to support level interrupts
  2012-06-18 14:33             ` Avi Kivity
@ 2012-06-18 16:47               ` Alex Williamson
  0 siblings, 0 replies; 24+ messages in thread
From: Alex Williamson @ 2012-06-18 16:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Michael S. Tsirkin, Jan Kiszka, mtosatti, kvm

On Mon, 2012-06-18 at 17:33 +0300, Avi Kivity wrote:
> On 06/18/2012 05:27 PM, Alex Williamson wrote:
> > On Mon, 2012-06-18 at 13:14 +0300, Avi Kivity wrote:
> >> On 06/18/2012 01:11 PM, Michael S. Tsirkin wrote:
> >> 
> >> >> (vhost,
> >> >> msi-less ivshmem clone)?
> >> > 
> >> > I guess vhost can poll eventfd and reinject an interrupt.
> >> > Of course to bypass qemu completely we also need to support reads over
> >> > ioeventfd somehow.
> >> > 
> >> 
> >> eventfd is not suitable for level triggered interrupts as far as I can
> >> tell.  It's about passing edges, and level interrupts aren't.  We need
> >> something like a pipe for communicating state (or just use KVM_IRQ_LINE).
> > 
> > Isn't level just two edges with a state in between?  Sure we may be
> > unnecessarily de-asserting for brief periods, but for an external
> > assigned device, we don't know whether it's unnecessary.  Thanks,
> 
> For an assigned device you have someone maintaining the state and keep
> pushing you edges as long as it's up.  So it's okay for you to keep
> deasserting the interrupt as the device will fix it up for you.  For an
> emulated device, you don't, so it isn't.

An emulated device doesn't know enough about it's own state to figure
out whether it should re-assert an interrupt?  Somehow I'm doubtful of
that.  What would be on the other end of that pipe if an emulated device
doesn't know when it's interrupting and when it's done?  Anyway, they're
probably not using an IRQFD to start with.

What I'm proposing emulates level signaling using something like this:

<device>: "Hey, push the button (assert interrupt)!"
<kvm>: "Bob (the guest) told me to let go of the button (deassert interrupt), let me know if I should push it again"

Another valid emulation of that is:

<device>: "Hey, push the button!"
<device>: "Let go of the button"

Nobody currently needs the latter, but the latter doesn't replace the
former for device assignment.  I can rename variables and flags to make
it more compatible with some future implementation of the latter if
that's helpful.

> btw, technically we're in the wrong here.  An interrupt handler may read
> the irr and see an incorrect value.  Of course no one does that so we
> can get away with it.

Hmm, you're speaking of existing irr emulation?  I don't see how this
makes anything less accurate or prevents the irr from being emulated
accurately through this patch.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2012-06-18 16:47 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-16 16:34 [RFC PATCH] kvm: Extend irqfd to support level interrupts Alex Williamson
2012-06-17 11:17 ` Jan Kiszka
2012-06-17 18:44 ` Michael S. Tsirkin
2012-06-17 21:38   ` Alex Williamson
2012-06-17 22:15     ` Alex Williamson
2012-06-18  6:00       ` Michael S. Tsirkin
2012-06-18 14:00         ` Alex Williamson
2012-06-18  5:51     ` Michael S. Tsirkin
2012-06-18 14:06       ` Alex Williamson
2012-06-18  8:02 ` Avi Kivity
2012-06-18  8:52   ` Jan Kiszka
2012-06-18  9:33     ` Avi Kivity
2012-06-18 10:11       ` Michael S. Tsirkin
2012-06-18 10:14         ` Avi Kivity
2012-06-18 10:55           ` Michael S. Tsirkin
2012-06-18 11:03             ` Avi Kivity
2012-06-18 11:17               ` Michael S. Tsirkin
2012-06-18 14:32                 ` Alex Williamson
2012-06-18 14:27           ` Alex Williamson
2012-06-18 14:33             ` Avi Kivity
2012-06-18 16:47               ` Alex Williamson
2012-06-18 14:23         ` Alex Williamson
2012-06-18 14:18   ` Alex Williamson
2012-06-18 14:35     ` Avi Kivity

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.