All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] kvm: enable irq injection from interrupt context
@ 2010-09-15 18:54 Michael S. Tsirkin
  2010-09-16  9:02 ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-15 18:54 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti, Gleb Natapov, kvm, linux-kernel

To avoid bouncing irqfd injection out to workqueue context,
we must support injecting irqs from local interrupt
context. Doing this seems to only require disabling
irqs locally.

RFC, completely untested, x86 only.

Thoughts?

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/x86/kvm/i8259.c |   12 ++++++++++++
 virt/kvm/eventfd.c   |   30 +++++++++---------------------
 virt/kvm/ioapic.c    |   34 ++++++++++++++++++++--------------
 3 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 8d10c06..294fa36 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -198,8 +198,10 @@ void kvm_pic_update_irq(struct kvm_pic *s)
 int kvm_pic_set_irq(void *opaque, int irq, int level)
 {
 	struct kvm_pic *s = opaque;
+	unsigned long flags;
 	int ret = -1;
 
+	local_irq_save(flags);
 	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
@@ -208,6 +210,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 				      s->pics[irq >> 3].imr, ret == 0);
 	}
 	pic_unlock(s);
+	local_irq_restore(flags);
 
 	return ret;
 }
@@ -235,8 +238,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 int kvm_pic_read_irq(struct kvm *kvm)
 {
 	int irq, irq2, intno;
+	unsigned long flags;
 	struct kvm_pic *s = pic_irqchip(kvm);
 
+	local_irq_save(flags);
 	pic_lock(s);
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
@@ -263,6 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	}
 	pic_update_irq(s);
 	pic_unlock(s);
+	local_irq_restore(flags);
 
 	return intno;
 }
@@ -476,6 +482,7 @@ static int picdev_write(struct kvm_io_device *this,
 {
 	struct kvm_pic *s = to_pic(this);
 	unsigned char data = *(unsigned char *)val;
+	unsigned long flags;
 	if (!picdev_in_range(addr))
 		return -EOPNOTSUPP;
 
@@ -484,6 +491,7 @@ static int picdev_write(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte write\n");
 		return 0;
 	}
+	local_irq_save(flags);
 	pic_lock(s);
 	switch (addr) {
 	case 0x20:
@@ -498,6 +506,7 @@ static int picdev_write(struct kvm_io_device *this,
 		break;
 	}
 	pic_unlock(s);
+	local_irq_restore(flags);
 	return 0;
 }
 
@@ -506,6 +515,7 @@ static int picdev_read(struct kvm_io_device *this,
 {
 	struct kvm_pic *s = to_pic(this);
 	unsigned char data = 0;
+	unsigned long flags;
 	if (!picdev_in_range(addr))
 		return -EOPNOTSUPP;
 
@@ -514,6 +524,7 @@ static int picdev_read(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte read\n");
 		return 0;
 	}
+	local_irq_save(flags);
 	pic_lock(s);
 	switch (addr) {
 	case 0x20:
@@ -529,6 +540,7 @@ static int picdev_read(struct kvm_io_device *this,
 	}
 	*(unsigned char *)val = data;
 	pic_unlock(s);
+	local_irq_restore(flags);
 	return 0;
 }
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 66cf65b..30ede90 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -50,22 +50,11 @@ struct _irqfd {
 	struct list_head          list;
 	poll_table                pt;
 	wait_queue_t              wait;
-	struct work_struct        inject;
 	struct work_struct        shutdown;
 };
 
 static struct workqueue_struct *irqfd_cleanup_wq;
 
-static void
-irqfd_inject(struct work_struct *work)
-{
-	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
-	struct kvm *kvm = irqfd->kvm;
-
-	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
-}
-
 /*
  * Race-free decouple logic (ordering is critical)
  */
@@ -82,12 +71,6 @@ irqfd_shutdown(struct work_struct *work)
 	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
 
 	/*
-	 * We know no new events will be scheduled at this point, so block
-	 * until all previously outstanding events have completed
-	 */
-	flush_work(&irqfd->inject);
-
-	/*
 	 * It is now safe to release the object's resources
 	 */
 	eventfd_ctx_put(irqfd->eventfd);
@@ -128,7 +111,8 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 
 	if (flags & POLLIN)
 		/* An event has been signaled, inject an interrupt */
-		schedule_work(&irqfd->inject);
+		kvm_set_irq(irqfd->kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+			    irqfd->gsi, 1);
 
 	if (flags & POLLHUP) {
 		/* The eventfd is closing, detach from KVM */
@@ -179,7 +163,6 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 	irqfd->kvm = kvm;
 	irqfd->gsi = gsi;
 	INIT_LIST_HEAD(&irqfd->list);
-	INIT_WORK(&irqfd->inject, irqfd_inject);
 	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
 
 	file = eventfd_fget(fd);
@@ -218,14 +201,19 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 	events = file->f_op->poll(file, &irqfd->pt);
 
 	list_add_tail(&irqfd->list, &kvm->irqfds.items);
-	spin_unlock_irq(&kvm->irqfds.lock);
 
 	/*
 	 * Check if there was an event already pending on the eventfd
 	 * before we registered, and trigger it as if we didn't miss it.
+	 *
+	 * Do this under irqfds.lock, this way we can be sure
+	 * the irqfd is not going away.
 	 */
 	if (events & POLLIN)
-		schedule_work(&irqfd->inject);
+		kvm_set_irq(irqfd->kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+			    irqfd->gsi, 1);
+
+	spin_unlock_irq(&kvm->irqfds.lock);
 
 	/*
 	 * do not drop the file until the irqfd is fully initialized, otherwise
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 0b9df83..cc1d2fe 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -197,8 +197,9 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 	u32 mask = 1 << irq;
 	union kvm_ioapic_redirect_entry entry;
 	int ret = 1;
+	unsigned long flags;
 
-	spin_lock(&ioapic->lock);
+	spin_lock_irqsave(&ioapic->lock, flags);
 	old_irr = ioapic->irr;
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
 		entry = ioapic->redirtbl[irq];
@@ -216,7 +217,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 		}
 		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
 	}
-	spin_unlock(&ioapic->lock);
+	spin_unlock_irqrestore(&ioapic->lock, flags);
 
 	return ret;
 }
@@ -240,9 +241,9 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 		 * is dropped it will be put into irr and will be delivered
 		 * after ack notifier returns.
 		 */
-		spin_unlock(&ioapic->lock);
+		spin_unlock_irqrestore(&ioapic->lock, flags);
 		kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
-		spin_lock(&ioapic->lock);
+		spin_lock_irqsave(&ioapic->lock, flags);
 
 		if (trigger_mode != IOAPIC_LEVEL_TRIG)
 			continue;
@@ -257,13 +258,14 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	unsigned long flags;
 
 	smp_rmb();
 	if (!test_bit(vector, ioapic->handled_vectors))
 		return;
-	spin_lock(&ioapic->lock);
+	spin_lock_irqsave(&ioapic->lock, flags);
 	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
-	spin_unlock(&ioapic->lock);
+	spin_unlock_irqrestore(&ioapic->lock, flags);
 }
 
 static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
@@ -281,6 +283,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 			    void *val)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
+	unsigned long flags;
 	u32 result;
 	if (!ioapic_in_range(ioapic, addr))
 		return -EOPNOTSUPP;
@@ -289,7 +292,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 	ASSERT(!(addr & 0xf));	/* check alignment */
 
 	addr &= 0xff;
-	spin_lock(&ioapic->lock);
+	spin_lock_irqsave(&ioapic->lock, flags);
 	switch (addr) {
 	case IOAPIC_REG_SELECT:
 		result = ioapic->ioregsel;
@@ -303,7 +306,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 		result = 0;
 		break;
 	}
-	spin_unlock(&ioapic->lock);
+	spin_unlock_irqrestore(&ioapic->lock, flags);
 
 	switch (len) {
 	case 8:
@@ -324,6 +327,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 			     const void *val)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
+	unsigned long flags;
 	u32 data;
 	if (!ioapic_in_range(ioapic, addr))
 		return -EOPNOTSUPP;
@@ -340,7 +344,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 	}
 
 	addr &= 0xff;
-	spin_lock(&ioapic->lock);
+	spin_lock_irqsave(&ioapic->lock, flags);
 	switch (addr) {
 	case IOAPIC_REG_SELECT:
 		ioapic->ioregsel = data;
@@ -358,7 +362,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 	default:
 		break;
 	}
-	spin_unlock(&ioapic->lock);
+	spin_unlock_irqrestore(&ioapic->lock, flags);
 	return 0;
 }
 
@@ -418,24 +422,26 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 {
 	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+	unsigned long flags;
 	if (!ioapic)
 		return -EINVAL;
 
-	spin_lock(&ioapic->lock);
+	spin_lock_irqsave(&ioapic->lock, flags);
 	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
-	spin_unlock(&ioapic->lock);
+	spin_unlock_irqrestore(&ioapic->lock, flags);
 	return 0;
 }
 
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 {
 	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+	unsigned long flags;
 	if (!ioapic)
 		return -EINVAL;
 
-	spin_lock(&ioapic->lock);
+	spin_lock_irqsave(&ioapic->lock, flags);
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	update_handled_vectors(ioapic);
-	spin_unlock(&ioapic->lock);
+	spin_unlock_irqrestore(&ioapic->lock, flags);
 	return 0;
 }
-- 
1.7.3.rc1.5.ge5969

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-15 18:54 [PATCH RFC] kvm: enable irq injection from interrupt context Michael S. Tsirkin
@ 2010-09-16  9:02 ` Gleb Natapov
  2010-09-16  9:10   ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16  9:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
> To avoid bouncing irqfd injection out to workqueue context,
> we must support injecting irqs from local interrupt
> context. Doing this seems to only require disabling
> irqs locally.
> 
> RFC, completely untested, x86 only.
> 
> Thoughts?
> 
We do not want to disable irqs for a long time and some of code paths
under lock involve looping over all cpus. For MSI injection path is
lockless and this is the only case that matters, so inject irqs from
local interrupt context if it is MSI or from workqueue otherwise.

> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  arch/x86/kvm/i8259.c |   12 ++++++++++++
>  virt/kvm/eventfd.c   |   30 +++++++++---------------------
>  virt/kvm/ioapic.c    |   34 ++++++++++++++++++++--------------
>  3 files changed, 41 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> index 8d10c06..294fa36 100644
> --- a/arch/x86/kvm/i8259.c
> +++ b/arch/x86/kvm/i8259.c
> @@ -198,8 +198,10 @@ void kvm_pic_update_irq(struct kvm_pic *s)
>  int kvm_pic_set_irq(void *opaque, int irq, int level)
>  {
>  	struct kvm_pic *s = opaque;
> +	unsigned long flags;
>  	int ret = -1;
>  
> +	local_irq_save(flags);
>  	pic_lock(s);
>  	if (irq >= 0 && irq < PIC_NUM_PINS) {
>  		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
> @@ -208,6 +210,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
>  				      s->pics[irq >> 3].imr, ret == 0);
>  	}
>  	pic_unlock(s);
> +	local_irq_restore(flags);
>  
>  	return ret;
>  }
> @@ -235,8 +238,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
>  int kvm_pic_read_irq(struct kvm *kvm)
>  {
>  	int irq, irq2, intno;
> +	unsigned long flags;
>  	struct kvm_pic *s = pic_irqchip(kvm);
>  
> +	local_irq_save(flags);
>  	pic_lock(s);
>  	irq = pic_get_irq(&s->pics[0]);
>  	if (irq >= 0) {
> @@ -263,6 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
>  	}
>  	pic_update_irq(s);
>  	pic_unlock(s);
> +	local_irq_restore(flags);
>  
>  	return intno;
>  }
> @@ -476,6 +482,7 @@ static int picdev_write(struct kvm_io_device *this,
>  {
>  	struct kvm_pic *s = to_pic(this);
>  	unsigned char data = *(unsigned char *)val;
> +	unsigned long flags;
>  	if (!picdev_in_range(addr))
>  		return -EOPNOTSUPP;
>  
> @@ -484,6 +491,7 @@ static int picdev_write(struct kvm_io_device *this,
>  			printk(KERN_ERR "PIC: non byte write\n");
>  		return 0;
>  	}
> +	local_irq_save(flags);
>  	pic_lock(s);
>  	switch (addr) {
>  	case 0x20:
> @@ -498,6 +506,7 @@ static int picdev_write(struct kvm_io_device *this,
>  		break;
>  	}
>  	pic_unlock(s);
> +	local_irq_restore(flags);
>  	return 0;
>  }
>  
> @@ -506,6 +515,7 @@ static int picdev_read(struct kvm_io_device *this,
>  {
>  	struct kvm_pic *s = to_pic(this);
>  	unsigned char data = 0;
> +	unsigned long flags;
>  	if (!picdev_in_range(addr))
>  		return -EOPNOTSUPP;
>  
> @@ -514,6 +524,7 @@ static int picdev_read(struct kvm_io_device *this,
>  			printk(KERN_ERR "PIC: non byte read\n");
>  		return 0;
>  	}
> +	local_irq_save(flags);
>  	pic_lock(s);
>  	switch (addr) {
>  	case 0x20:
> @@ -529,6 +540,7 @@ static int picdev_read(struct kvm_io_device *this,
>  	}
>  	*(unsigned char *)val = data;
>  	pic_unlock(s);
> +	local_irq_restore(flags);
>  	return 0;
>  }
>  
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 66cf65b..30ede90 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -50,22 +50,11 @@ struct _irqfd {
>  	struct list_head          list;
>  	poll_table                pt;
>  	wait_queue_t              wait;
> -	struct work_struct        inject;
>  	struct work_struct        shutdown;
>  };
>  
>  static struct workqueue_struct *irqfd_cleanup_wq;
>  
> -static void
> -irqfd_inject(struct work_struct *work)
> -{
> -	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> -	struct kvm *kvm = irqfd->kvm;
> -
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> -}
> -
>  /*
>   * Race-free decouple logic (ordering is critical)
>   */
> @@ -82,12 +71,6 @@ irqfd_shutdown(struct work_struct *work)
>  	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
>  
>  	/*
> -	 * We know no new events will be scheduled at this point, so block
> -	 * until all previously outstanding events have completed
> -	 */
> -	flush_work(&irqfd->inject);
> -
> -	/*
>  	 * It is now safe to release the object's resources
>  	 */
>  	eventfd_ctx_put(irqfd->eventfd);
> @@ -128,7 +111,8 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>  
>  	if (flags & POLLIN)
>  		/* An event has been signaled, inject an interrupt */
> -		schedule_work(&irqfd->inject);
> +		kvm_set_irq(irqfd->kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> +			    irqfd->gsi, 1);
>  
>  	if (flags & POLLHUP) {
>  		/* The eventfd is closing, detach from KVM */
> @@ -179,7 +163,6 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  	irqfd->kvm = kvm;
>  	irqfd->gsi = gsi;
>  	INIT_LIST_HEAD(&irqfd->list);
> -	INIT_WORK(&irqfd->inject, irqfd_inject);
>  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
>  
>  	file = eventfd_fget(fd);
> @@ -218,14 +201,19 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  	events = file->f_op->poll(file, &irqfd->pt);
>  
>  	list_add_tail(&irqfd->list, &kvm->irqfds.items);
> -	spin_unlock_irq(&kvm->irqfds.lock);
>  
>  	/*
>  	 * Check if there was an event already pending on the eventfd
>  	 * before we registered, and trigger it as if we didn't miss it.
> +	 *
> +	 * Do this under irqfds.lock, this way we can be sure
> +	 * the irqfd is not going away.
>  	 */
>  	if (events & POLLIN)
> -		schedule_work(&irqfd->inject);
> +		kvm_set_irq(irqfd->kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> +			    irqfd->gsi, 1);
> +
> +	spin_unlock_irq(&kvm->irqfds.lock);
>  
>  	/*
>  	 * do not drop the file until the irqfd is fully initialized, otherwise
> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> index 0b9df83..cc1d2fe 100644
> --- a/virt/kvm/ioapic.c
> +++ b/virt/kvm/ioapic.c
> @@ -197,8 +197,9 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
>  	u32 mask = 1 << irq;
>  	union kvm_ioapic_redirect_entry entry;
>  	int ret = 1;
> +	unsigned long flags;
>  
> -	spin_lock(&ioapic->lock);
> +	spin_lock_irqsave(&ioapic->lock, flags);
>  	old_irr = ioapic->irr;
>  	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
>  		entry = ioapic->redirtbl[irq];
> @@ -216,7 +217,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
>  		}
>  		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
>  	}
> -	spin_unlock(&ioapic->lock);
> +	spin_unlock_irqrestore(&ioapic->lock, flags);
>  
>  	return ret;
>  }
> @@ -240,9 +241,9 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
>  		 * is dropped it will be put into irr and will be delivered
>  		 * after ack notifier returns.
>  		 */
> -		spin_unlock(&ioapic->lock);
> +		spin_unlock_irqrestore(&ioapic->lock, flags);
>  		kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
> -		spin_lock(&ioapic->lock);
> +		spin_lock_irqsave(&ioapic->lock, flags);
>  
>  		if (trigger_mode != IOAPIC_LEVEL_TRIG)
>  			continue;
> @@ -257,13 +258,14 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
>  void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
>  {
>  	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +	unsigned long flags;
>  
>  	smp_rmb();
>  	if (!test_bit(vector, ioapic->handled_vectors))
>  		return;
> -	spin_lock(&ioapic->lock);
> +	spin_lock_irqsave(&ioapic->lock, flags);
>  	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
> -	spin_unlock(&ioapic->lock);
> +	spin_unlock_irqrestore(&ioapic->lock, flags);
>  }
>  
>  static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
> @@ -281,6 +283,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
>  			    void *val)
>  {
>  	struct kvm_ioapic *ioapic = to_ioapic(this);
> +	unsigned long flags;
>  	u32 result;
>  	if (!ioapic_in_range(ioapic, addr))
>  		return -EOPNOTSUPP;
> @@ -289,7 +292,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
>  	ASSERT(!(addr & 0xf));	/* check alignment */
>  
>  	addr &= 0xff;
> -	spin_lock(&ioapic->lock);
> +	spin_lock_irqsave(&ioapic->lock, flags);
>  	switch (addr) {
>  	case IOAPIC_REG_SELECT:
>  		result = ioapic->ioregsel;
> @@ -303,7 +306,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
>  		result = 0;
>  		break;
>  	}
> -	spin_unlock(&ioapic->lock);
> +	spin_unlock_irqrestore(&ioapic->lock, flags);
>  
>  	switch (len) {
>  	case 8:
> @@ -324,6 +327,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
>  			     const void *val)
>  {
>  	struct kvm_ioapic *ioapic = to_ioapic(this);
> +	unsigned long flags;
>  	u32 data;
>  	if (!ioapic_in_range(ioapic, addr))
>  		return -EOPNOTSUPP;
> @@ -340,7 +344,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
>  	}
>  
>  	addr &= 0xff;
> -	spin_lock(&ioapic->lock);
> +	spin_lock_irqsave(&ioapic->lock, flags);
>  	switch (addr) {
>  	case IOAPIC_REG_SELECT:
>  		ioapic->ioregsel = data;
> @@ -358,7 +362,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
>  	default:
>  		break;
>  	}
> -	spin_unlock(&ioapic->lock);
> +	spin_unlock_irqrestore(&ioapic->lock, flags);
>  	return 0;
>  }
>  
> @@ -418,24 +422,26 @@ void kvm_ioapic_destroy(struct kvm *kvm)
>  int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
>  {
>  	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
> +	unsigned long flags;
>  	if (!ioapic)
>  		return -EINVAL;
>  
> -	spin_lock(&ioapic->lock);
> +	spin_lock_irqsave(&ioapic->lock, flags);
>  	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
> -	spin_unlock(&ioapic->lock);
> +	spin_unlock_irqrestore(&ioapic->lock, flags);
>  	return 0;
>  }
>  
>  int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
>  {
>  	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
> +	unsigned long flags;
>  	if (!ioapic)
>  		return -EINVAL;
>  
> -	spin_lock(&ioapic->lock);
> +	spin_lock_irqsave(&ioapic->lock, flags);
>  	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
>  	update_handled_vectors(ioapic);
> -	spin_unlock(&ioapic->lock);
> +	spin_unlock_irqrestore(&ioapic->lock, flags);
>  	return 0;
>  }
> -- 
> 1.7.3.rc1.5.ge5969

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16  9:02 ` Gleb Natapov
@ 2010-09-16  9:10   ` Michael S. Tsirkin
  2010-09-16  9:25     ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16  9:10 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote:
> On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
> > To avoid bouncing irqfd injection out to workqueue context,
> > we must support injecting irqs from local interrupt
> > context. Doing this seems to only require disabling
> > irqs locally.
> > 
> > RFC, completely untested, x86 only.
> > 
> > Thoughts?
> > 
> We do not want to disable irqs for a long time and some of code paths
> under lock involve looping over all cpus. For MSI injection path is
> lockless and this is the only case that matters,

MSI only appeared in rhel6, older guests still use level interrupts.
Which paths require looping over all cpus? Do PCI interrupts
need this?

> so inject irqs from
> local interrupt context if it is MSI or from workqueue otherwise.

OK, but this would need a rework of irqfd code as it is currently unaware
of interrupt type.

> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  arch/x86/kvm/i8259.c |   12 ++++++++++++
> >  virt/kvm/eventfd.c   |   30 +++++++++---------------------
> >  virt/kvm/ioapic.c    |   34 ++++++++++++++++++++--------------
> >  3 files changed, 41 insertions(+), 35 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> > index 8d10c06..294fa36 100644
> > --- a/arch/x86/kvm/i8259.c
> > +++ b/arch/x86/kvm/i8259.c
> > @@ -198,8 +198,10 @@ void kvm_pic_update_irq(struct kvm_pic *s)
> >  int kvm_pic_set_irq(void *opaque, int irq, int level)
> >  {
> >  	struct kvm_pic *s = opaque;
> > +	unsigned long flags;
> >  	int ret = -1;
> >  
> > +	local_irq_save(flags);
> >  	pic_lock(s);
> >  	if (irq >= 0 && irq < PIC_NUM_PINS) {
> >  		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
> > @@ -208,6 +210,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
> >  				      s->pics[irq >> 3].imr, ret == 0);
> >  	}
> >  	pic_unlock(s);
> > +	local_irq_restore(flags);
> >  
> >  	return ret;
> >  }
> > @@ -235,8 +238,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
> >  int kvm_pic_read_irq(struct kvm *kvm)
> >  {
> >  	int irq, irq2, intno;
> > +	unsigned long flags;
> >  	struct kvm_pic *s = pic_irqchip(kvm);
> >  
> > +	local_irq_save(flags);
> >  	pic_lock(s);
> >  	irq = pic_get_irq(&s->pics[0]);
> >  	if (irq >= 0) {
> > @@ -263,6 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
> >  	}
> >  	pic_update_irq(s);
> >  	pic_unlock(s);
> > +	local_irq_restore(flags);
> >  
> >  	return intno;
> >  }
> > @@ -476,6 +482,7 @@ static int picdev_write(struct kvm_io_device *this,
> >  {
> >  	struct kvm_pic *s = to_pic(this);
> >  	unsigned char data = *(unsigned char *)val;
> > +	unsigned long flags;
> >  	if (!picdev_in_range(addr))
> >  		return -EOPNOTSUPP;
> >  
> > @@ -484,6 +491,7 @@ static int picdev_write(struct kvm_io_device *this,
> >  			printk(KERN_ERR "PIC: non byte write\n");
> >  		return 0;
> >  	}
> > +	local_irq_save(flags);
> >  	pic_lock(s);
> >  	switch (addr) {
> >  	case 0x20:
> > @@ -498,6 +506,7 @@ static int picdev_write(struct kvm_io_device *this,
> >  		break;
> >  	}
> >  	pic_unlock(s);
> > +	local_irq_restore(flags);
> >  	return 0;
> >  }
> >  
> > @@ -506,6 +515,7 @@ static int picdev_read(struct kvm_io_device *this,
> >  {
> >  	struct kvm_pic *s = to_pic(this);
> >  	unsigned char data = 0;
> > +	unsigned long flags;
> >  	if (!picdev_in_range(addr))
> >  		return -EOPNOTSUPP;
> >  
> > @@ -514,6 +524,7 @@ static int picdev_read(struct kvm_io_device *this,
> >  			printk(KERN_ERR "PIC: non byte read\n");
> >  		return 0;
> >  	}
> > +	local_irq_save(flags);
> >  	pic_lock(s);
> >  	switch (addr) {
> >  	case 0x20:
> > @@ -529,6 +540,7 @@ static int picdev_read(struct kvm_io_device *this,
> >  	}
> >  	*(unsigned char *)val = data;
> >  	pic_unlock(s);
> > +	local_irq_restore(flags);
> >  	return 0;
> >  }
> >  
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index 66cf65b..30ede90 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -50,22 +50,11 @@ struct _irqfd {
> >  	struct list_head          list;
> >  	poll_table                pt;
> >  	wait_queue_t              wait;
> > -	struct work_struct        inject;
> >  	struct work_struct        shutdown;
> >  };
> >  
> >  static struct workqueue_struct *irqfd_cleanup_wq;
> >  
> > -static void
> > -irqfd_inject(struct work_struct *work)
> > -{
> > -	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> > -	struct kvm *kvm = irqfd->kvm;
> > -
> > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > -}
> > -
> >  /*
> >   * Race-free decouple logic (ordering is critical)
> >   */
> > @@ -82,12 +71,6 @@ irqfd_shutdown(struct work_struct *work)
> >  	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
> >  
> >  	/*
> > -	 * We know no new events will be scheduled at this point, so block
> > -	 * until all previously outstanding events have completed
> > -	 */
> > -	flush_work(&irqfd->inject);
> > -
> > -	/*
> >  	 * It is now safe to release the object's resources
> >  	 */
> >  	eventfd_ctx_put(irqfd->eventfd);
> > @@ -128,7 +111,8 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> >  
> >  	if (flags & POLLIN)
> >  		/* An event has been signaled, inject an interrupt */
> > -		schedule_work(&irqfd->inject);
> > +		kvm_set_irq(irqfd->kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> > +			    irqfd->gsi, 1);
> >  
> >  	if (flags & POLLHUP) {
> >  		/* The eventfd is closing, detach from KVM */
> > @@ -179,7 +163,6 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> >  	irqfd->kvm = kvm;
> >  	irqfd->gsi = gsi;
> >  	INIT_LIST_HEAD(&irqfd->list);
> > -	INIT_WORK(&irqfd->inject, irqfd_inject);
> >  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> >  
> >  	file = eventfd_fget(fd);
> > @@ -218,14 +201,19 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> >  	events = file->f_op->poll(file, &irqfd->pt);
> >  
> >  	list_add_tail(&irqfd->list, &kvm->irqfds.items);
> > -	spin_unlock_irq(&kvm->irqfds.lock);
> >  
> >  	/*
> >  	 * Check if there was an event already pending on the eventfd
> >  	 * before we registered, and trigger it as if we didn't miss it.
> > +	 *
> > +	 * Do this under irqfds.lock, this way we can be sure
> > +	 * the irqfd is not going away.
> >  	 */
> >  	if (events & POLLIN)
> > -		schedule_work(&irqfd->inject);
> > +		kvm_set_irq(irqfd->kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> > +			    irqfd->gsi, 1);
> > +
> > +	spin_unlock_irq(&kvm->irqfds.lock);
> >  
> >  	/*
> >  	 * do not drop the file until the irqfd is fully initialized, otherwise
> > diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> > index 0b9df83..cc1d2fe 100644
> > --- a/virt/kvm/ioapic.c
> > +++ b/virt/kvm/ioapic.c
> > @@ -197,8 +197,9 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
> >  	u32 mask = 1 << irq;
> >  	union kvm_ioapic_redirect_entry entry;
> >  	int ret = 1;
> > +	unsigned long flags;
> >  
> > -	spin_lock(&ioapic->lock);
> > +	spin_lock_irqsave(&ioapic->lock, flags);
> >  	old_irr = ioapic->irr;
> >  	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
> >  		entry = ioapic->redirtbl[irq];
> > @@ -216,7 +217,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
> >  		}
> >  		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
> >  	}
> > -	spin_unlock(&ioapic->lock);
> > +	spin_unlock_irqrestore(&ioapic->lock, flags);
> >  
> >  	return ret;
> >  }
> > @@ -240,9 +241,9 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
> >  		 * is dropped it will be put into irr and will be delivered
> >  		 * after ack notifier returns.
> >  		 */
> > -		spin_unlock(&ioapic->lock);
> > +		spin_unlock_irqrestore(&ioapic->lock, flags);
> >  		kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
> > -		spin_lock(&ioapic->lock);
> > +		spin_lock_irqsave(&ioapic->lock, flags);
> >  
> >  		if (trigger_mode != IOAPIC_LEVEL_TRIG)
> >  			continue;
> > @@ -257,13 +258,14 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
> >  void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
> >  {
> >  	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> > +	unsigned long flags;
> >  
> >  	smp_rmb();
> >  	if (!test_bit(vector, ioapic->handled_vectors))
> >  		return;
> > -	spin_lock(&ioapic->lock);
> > +	spin_lock_irqsave(&ioapic->lock, flags);
> >  	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
> > -	spin_unlock(&ioapic->lock);
> > +	spin_unlock_irqrestore(&ioapic->lock, flags);
> >  }
> >  
> >  static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
> > @@ -281,6 +283,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  			    void *val)
> >  {
> >  	struct kvm_ioapic *ioapic = to_ioapic(this);
> > +	unsigned long flags;
> >  	u32 result;
> >  	if (!ioapic_in_range(ioapic, addr))
> >  		return -EOPNOTSUPP;
> > @@ -289,7 +292,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  	ASSERT(!(addr & 0xf));	/* check alignment */
> >  
> >  	addr &= 0xff;
> > -	spin_lock(&ioapic->lock);
> > +	spin_lock_irqsave(&ioapic->lock, flags);
> >  	switch (addr) {
> >  	case IOAPIC_REG_SELECT:
> >  		result = ioapic->ioregsel;
> > @@ -303,7 +306,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  		result = 0;
> >  		break;
> >  	}
> > -	spin_unlock(&ioapic->lock);
> > +	spin_unlock_irqrestore(&ioapic->lock, flags);
> >  
> >  	switch (len) {
> >  	case 8:
> > @@ -324,6 +327,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
> >  			     const void *val)
> >  {
> >  	struct kvm_ioapic *ioapic = to_ioapic(this);
> > +	unsigned long flags;
> >  	u32 data;
> >  	if (!ioapic_in_range(ioapic, addr))
> >  		return -EOPNOTSUPP;
> > @@ -340,7 +344,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
> >  	}
> >  
> >  	addr &= 0xff;
> > -	spin_lock(&ioapic->lock);
> > +	spin_lock_irqsave(&ioapic->lock, flags);
> >  	switch (addr) {
> >  	case IOAPIC_REG_SELECT:
> >  		ioapic->ioregsel = data;
> > @@ -358,7 +362,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
> >  	default:
> >  		break;
> >  	}
> > -	spin_unlock(&ioapic->lock);
> > +	spin_unlock_irqrestore(&ioapic->lock, flags);
> >  	return 0;
> >  }
> >  
> > @@ -418,24 +422,26 @@ void kvm_ioapic_destroy(struct kvm *kvm)
> >  int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
> >  {
> >  	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
> > +	unsigned long flags;
> >  	if (!ioapic)
> >  		return -EINVAL;
> >  
> > -	spin_lock(&ioapic->lock);
> > +	spin_lock_irqsave(&ioapic->lock, flags);
> >  	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
> > -	spin_unlock(&ioapic->lock);
> > +	spin_unlock_irqrestore(&ioapic->lock, flags);
> >  	return 0;
> >  }
> >  
> >  int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
> >  {
> >  	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
> > +	unsigned long flags;
> >  	if (!ioapic)
> >  		return -EINVAL;
> >  
> > -	spin_lock(&ioapic->lock);
> > +	spin_lock_irqsave(&ioapic->lock, flags);
> >  	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
> >  	update_handled_vectors(ioapic);
> > -	spin_unlock(&ioapic->lock);
> > +	spin_unlock_irqrestore(&ioapic->lock, flags);
> >  	return 0;
> >  }
> > -- 
> > 1.7.3.rc1.5.ge5969
> 
> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16  9:10   ` Michael S. Tsirkin
@ 2010-09-16  9:25     ` Gleb Natapov
  2010-09-16  9:43       ` Michael S. Tsirkin
  2010-09-16  9:46       ` Avi Kivity
  0 siblings, 2 replies; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16  9:25 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 11:10:55AM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote:
> > On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
> > > To avoid bouncing irqfd injection out to workqueue context,
> > > we must support injecting irqs from local interrupt
> > > context. Doing this seems to only require disabling
> > > irqs locally.
> > > 
> > > RFC, completely untested, x86 only.
> > > 
> > > Thoughts?
> > > 
> > We do not want to disable irqs for a long time and some of code paths
> > under lock involve looping over all cpus. For MSI injection path is
> > lockless and this is the only case that matters,
> 
> MSI only appeared in rhel6, older guests still use level interrupts.
So they are already slow for other reasons.

> Which paths require looping over all cpus? Do PCI interrupts
> need this?
> 
All interrupts need it. IOAPIC has a loop to find dst cpu for interrupt.
Pic has a loop to find cpu in virtual wire mode.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16  9:25     ` Gleb Natapov
@ 2010-09-16  9:43       ` Michael S. Tsirkin
  2010-09-16  9:46       ` Avi Kivity
  1 sibling, 0 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16  9:43 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 11:25:53AM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 11:10:55AM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote:
> > > On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
> > > > To avoid bouncing irqfd injection out to workqueue context,
> > > > we must support injecting irqs from local interrupt
> > > > context. Doing this seems to only require disabling
> > > > irqs locally.
> > > > 
> > > > RFC, completely untested, x86 only.
> > > > 
> > > > Thoughts?
> > > > 
> > > We do not want to disable irqs for a long time and some of code paths
> > > under lock involve looping over all cpus. For MSI injection path is
> > > lockless and this is the only case that matters,
> > 
> > MSI only appeared in rhel6, older guests still use level interrupts.
> So they are already slow for other reasons.

They are slower than MSI, but I do not want irqfd to be slower
than ioctls.

> > Which paths require looping over all cpus? Do PCI interrupts
> > need this?
> > 
> All interrupts need it. IOAPIC has a loop to find dst cpu for interrupt.
> Pic has a loop to find cpu in virtual wire mode.
> 
> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16  9:25     ` Gleb Natapov
  2010-09-16  9:43       ` Michael S. Tsirkin
@ 2010-09-16  9:46       ` Avi Kivity
  2010-09-16  9:53         ` Michael S. Tsirkin
  1 sibling, 1 reply; 37+ messages in thread
From: Avi Kivity @ 2010-09-16  9:46 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Michael S. Tsirkin, Marcelo Tosatti, kvm, linux-kernel

  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> >
> >  MSI only appeared in rhel6, older guests still use level interrupts.
> So they are already slow for other reasons.
>

Exactly, for example they need to exit to userspace to ack the 
interrupt.  That's far slower than the workqueue.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16  9:46       ` Avi Kivity
@ 2010-09-16  9:53         ` Michael S. Tsirkin
  2010-09-16 10:13           ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16  9:53 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
>  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> >>
> >>  MSI only appeared in rhel6, older guests still use level interrupts.
> >So they are already slow for other reasons.
> >
> 
> Exactly, for example they need to exit to userspace to ack the
> interrupt.  That's far slower than the workqueue.

Well, this is not exactly comparable: you might get
same irq asserted multiple times and only deasserted once.


> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16  9:53         ` Michael S. Tsirkin
@ 2010-09-16 10:13           ` Gleb Natapov
  2010-09-16 10:13             ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 10:13 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > >>
> > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > >So they are already slow for other reasons.
> > >
> > 
> > Exactly, for example they need to exit to userspace to ack the
> > interrupt.  That's far slower than the workqueue.
> 
> Well, this is not exactly comparable: you might get
> same irq asserted multiple times and only deasserted once.
> 
Are we talking about level interrupts? Why would you assert level
triggered interrupt multiple times before deasserting it?

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 10:13           ` Gleb Natapov
@ 2010-09-16 10:13             ` Michael S. Tsirkin
  2010-09-16 10:20               ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 10:13 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > >>
> > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > >So they are already slow for other reasons.
> > > >
> > > 
> > > Exactly, for example they need to exit to userspace to ack the
> > > interrupt.  That's far slower than the workqueue.
> > 
> > Well, this is not exactly comparable: you might get
> > same irq asserted multiple times and only deasserted once.
> > 
> Are we talking about level interrupts? Why would you assert level
> triggered interrupt multiple times before deasserting it?

User of irqfd has no way to know what current interrupt level is.
So it has to keep asserting.

> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 10:13             ` Michael S. Tsirkin
@ 2010-09-16 10:20               ` Gleb Natapov
  2010-09-16 10:44                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 10:20 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> > On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > > >>
> > > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > > >So they are already slow for other reasons.
> > > > >
> > > > 
> > > > Exactly, for example they need to exit to userspace to ack the
> > > > interrupt.  That's far slower than the workqueue.
> > > 
> > > Well, this is not exactly comparable: you might get
> > > same irq asserted multiple times and only deasserted once.
> > > 
> > Are we talking about level interrupts? Why would you assert level
> > triggered interrupt multiple times before deasserting it?
> 
> User of irqfd has no way to know what current interrupt level is.
> So it has to keep asserting.
> 
Why can't it keep track of current level?

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 10:20               ` Gleb Natapov
@ 2010-09-16 10:44                 ` Michael S. Tsirkin
  2010-09-16 10:54                   ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 10:44 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> > > On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > > > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > > > >>
> > > > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > > > >So they are already slow for other reasons.
> > > > > >
> > > > > 
> > > > > Exactly, for example they need to exit to userspace to ack the
> > > > > interrupt.  That's far slower than the workqueue.
> > > > 
> > > > Well, this is not exactly comparable: you might get
> > > > same irq asserted multiple times and only deasserted once.
> > > > 
> > > Are we talking about level interrupts? Why would you assert level
> > > triggered interrupt multiple times before deasserting it?
> > 
> > User of irqfd has no way to know what current interrupt level is.
> > So it has to keep asserting.
> > 
> Why can't it keep track of current level?

This breaks the model: eventfd user is unaware of PCI, levels and such:
it just signals the event.  Remember that asserts are done from e.g. vhost-net,
deasserts need to be handled by qemu.

> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 10:54                   ` Gleb Natapov
@ 2010-09-16 10:53                     ` Michael S. Tsirkin
  2010-09-16 11:17                       ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 10:53 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 12:54:03PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
> > > On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> > > > > On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > > > > > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > > > > > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > > > > > >>
> > > > > > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > > > > > >So they are already slow for other reasons.
> > > > > > > >
> > > > > > > 
> > > > > > > Exactly, for example they need to exit to userspace to ack the
> > > > > > > interrupt.  That's far slower than the workqueue.
> > > > > > 
> > > > > > Well, this is not exactly comparable: you might get
> > > > > > same irq asserted multiple times and only deasserted once.
> > > > > > 
> > > > > Are we talking about level interrupts? Why would you assert level
> > > > > triggered interrupt multiple times before deasserting it?
> > > > 
> > > > User of irqfd has no way to know what current interrupt level is.
> > > > So it has to keep asserting.
> > > > 
> > > Why can't it keep track of current level?
> > 
> > This breaks the model: eventfd user is unaware of PCI, levels and such:
> > it just signals the event.  Remember that asserts are done from e.g. vhost-net,
> > deasserts need to be handled by qemu.
> > 
> eventfd user implements HW and it knows exactly what type of interrupt
> this HW generates.

We haver two users: qemu does deasserts, vhost-net does asserts.
Another application is out of process virtio (sandboxing!).
Again, pci stuff needs to stay in qemu.

> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 10:44                 ` Michael S. Tsirkin
@ 2010-09-16 10:54                   ` Gleb Natapov
  2010-09-16 10:53                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 10:54 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
> > On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> > > > On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > > > > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > > > > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > > > > >>
> > > > > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > > > > >So they are already slow for other reasons.
> > > > > > >
> > > > > > 
> > > > > > Exactly, for example they need to exit to userspace to ack the
> > > > > > interrupt.  That's far slower than the workqueue.
> > > > > 
> > > > > Well, this is not exactly comparable: you might get
> > > > > same irq asserted multiple times and only deasserted once.
> > > > > 
> > > > Are we talking about level interrupts? Why would you assert level
> > > > triggered interrupt multiple times before deasserting it?
> > > 
> > > User of irqfd has no way to know what current interrupt level is.
> > > So it has to keep asserting.
> > > 
> > Why can't it keep track of current level?
> 
> This breaks the model: eventfd user is unaware of PCI, levels and such:
> it just signals the event.  Remember that asserts are done from e.g. vhost-net,
> deasserts need to be handled by qemu.
> 
eventfd user implements HW and it knows exactly what type of interrupt
this HW generates.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 10:53                     ` Michael S. Tsirkin
@ 2010-09-16 11:17                       ` Gleb Natapov
  2010-09-16 12:13                         ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 11:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 12:53:52PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 12:54:03PM +0200, Gleb Natapov wrote:
> > On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
> > > > On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
> > > > > On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> > > > > > On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > > > > > > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > > > > > > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > > > > > > >>
> > > > > > > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > > > > > > >So they are already slow for other reasons.
> > > > > > > > >
> > > > > > > > 
> > > > > > > > Exactly, for example they need to exit to userspace to ack the
> > > > > > > > interrupt.  That's far slower than the workqueue.
> > > > > > > 
> > > > > > > Well, this is not exactly comparable: you might get
> > > > > > > same irq asserted multiple times and only deasserted once.
> > > > > > > 
> > > > > > Are we talking about level interrupts? Why would you assert level
> > > > > > triggered interrupt multiple times before deasserting it?
> > > > > 
> > > > > User of irqfd has no way to know what current interrupt level is.
> > > > > So it has to keep asserting.
> > > > > 
> > > > Why can't it keep track of current level?
> > > 
> > > This breaks the model: eventfd user is unaware of PCI, levels and such:
> > > it just signals the event.  Remember that asserts are done from e.g. vhost-net,
> > > deasserts need to be handled by qemu.
> > > 
> > eventfd user implements HW and it knows exactly what type of interrupt
> > this HW generates.
> 
> We haver two users: qemu does deasserts, vhost-net does asserts.
Well this is broken. You want KVM to track level for you and this is
wrong. KVM does this anyway because it can't relay on devise model
to behave correctly [0], but in your case it is designed to behave
incorrectly.

Interrupt type is a device property. PCI devices just happen to be level
triggered according to PCI spec. What if you want to use vhost-net to
implement network device which has active-low interrupt line? [1]

If you want to split parts that asserts irq and de-asserts it then we
should have irqfd that tracks line status and knows interrupt line
polarity.

> Another application is out of process virtio (sandboxing!).
It will still assert and de-assert irq at the same code, so it will be
able to track irq line status.

> Again, pci stuff needs to stay in qemu.
> 

Nothing to do with PCI whatsoever.

[0] most qemu devices behave incorrectly and trigger level irq more then
    needed.
[1] this is how correct PCI device should behave but we override
    polarity in ACPI, but now incorrect behaviour is deeply designed
    into vhost-net.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 11:17                       ` Gleb Natapov
@ 2010-09-16 12:13                         ` Michael S. Tsirkin
  2010-09-16 12:33                           ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 12:13 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 01:17:52PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 12:53:52PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 12:54:03PM +0200, Gleb Natapov wrote:
> > > On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
> > > > > On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
> > > > > > On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
> > > > > > > On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
> > > > > > > > On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
> > > > > > > > >  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
> > > > > > > > > >>
> > > > > > > > > >>  MSI only appeared in rhel6, older guests still use level interrupts.
> > > > > > > > > >So they are already slow for other reasons.
> > > > > > > > > >
> > > > > > > > > 
> > > > > > > > > Exactly, for example they need to exit to userspace to ack the
> > > > > > > > > interrupt.  That's far slower than the workqueue.
> > > > > > > > 
> > > > > > > > Well, this is not exactly comparable: you might get
> > > > > > > > same irq asserted multiple times and only deasserted once.
> > > > > > > > 
> > > > > > > Are we talking about level interrupts? Why would you assert level
> > > > > > > triggered interrupt multiple times before deasserting it?
> > > > > > 
> > > > > > User of irqfd has no way to know what current interrupt level is.
> > > > > > So it has to keep asserting.
> > > > > > 
> > > > > Why can't it keep track of current level?
> > > > 
> > > > This breaks the model: eventfd user is unaware of PCI, levels and such:
> > > > it just signals the event.  Remember that asserts are done from e.g. vhost-net,
> > > > deasserts need to be handled by qemu.
> > > > 
> > > eventfd user implements HW and it knows exactly what type of interrupt
> > > this HW generates.
> > 
> > We haver two users: qemu does deasserts, vhost-net does asserts.
> Well this is broken. You want KVM to track level for you and this is
> wrong. KVM does this anyway because it can't relay on devise model
> to behave correctly [0], but in your case it is designed to behave
> incorrectly.
> 
> Interrupt type is a device property. PCI devices just happen to be level
> triggered according to PCI spec. What if you want to use vhost-net to
> implement network device which has active-low interrupt line? [1]

The polarity would have to be reversed in gsi (irq line can be shared,
all devices must be active high or low consistently).

> If you want to split parts that asserts irq and de-asserts it then we
> should have irqfd that tracks line status and knows interrupt line
> polarity.

Yes, it can know about polarity even though I think it's cleaner to do this
per gsi. But it can not track line status as line is shared with
other devices.

> > Another application is out of process virtio (sandboxing!).
> It will still assert and de-assert irq at the same code, so it will be
> able to track irq line status.
> 
> > Again, pci stuff needs to stay in qemu.
> > 
> 
> Nothing to do with PCI whatsoever.
> 
> [0] most qemu devices behave incorrectly and trigger level irq more then
>     needed.

Which devices?
pci core tracks line status and will never assert the same
line multiple times.

> [1] this is how correct PCI device should behave but we override
>     polarity in ACPI, but now incorrect behaviour is deeply designed
>     into vhost-net.

Not really, vhost net signals an eventfd. What happens then is
up to kvm.

> --
> 			Gleb.




^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 12:13                         ` Michael S. Tsirkin
@ 2010-09-16 12:33                           ` Gleb Natapov
  2010-09-16 12:57                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 12:33 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
> > > We haver two users: qemu does deasserts, vhost-net does asserts.
> > Well this is broken. You want KVM to track level for you and this is
> > wrong. KVM does this anyway because it can't relay on devise model
> > to behave correctly [0], but in your case it is designed to behave
> > incorrectly.
> > 
> > Interrupt type is a device property. PCI devices just happen to be level
> > triggered according to PCI spec. What if you want to use vhost-net to
> > implement network device which has active-low interrupt line? [1]
> 
> The polarity would have to be reversed in gsi (irq line can be shared,
> all devices must be active high or low consistently).
> 
There are gsi dedicated to PCI. They can be shared only between PCI
devices.

> > If you want to split parts that asserts irq and de-asserts it then we
> > should have irqfd that tracks line status and knows interrupt line
> > polarity.
> 
> Yes, it can know about polarity even though I think it's cleaner to do this
> per gsi. But it can not track line status as line is shared with
> other devices.
It should track only device's line status.

> 
> > > Another application is out of process virtio (sandboxing!).
> > It will still assert and de-assert irq at the same code, so it will be
> > able to track irq line status.
> > 
> > > Again, pci stuff needs to stay in qemu.
> > > 
> > 
> > Nothing to do with PCI whatsoever.
> > 
> > [0] most qemu devices behave incorrectly and trigger level irq more then
> >     needed.
> 
> Which devices?
Most of them. They just call update_irq_status() or something and
re-assert interrupt regardless of what previous status was.

> pci core tracks line status and will never assert the same
> line multiple times.
That's good if pci core does this, but device shouldn't even try it.

> 
> > [1] this is how correct PCI device should behave but we override
> >     polarity in ACPI, but now incorrect behaviour is deeply designed
> >     into vhost-net.
> 
> Not really, vhost net signals an eventfd. What happens then is
> up to kvm.
> 
That is what current broken design does and it works, but if you want to
save unneeded calls into kvm fix design.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 12:33                           ` Gleb Natapov
@ 2010-09-16 12:57                             ` Michael S. Tsirkin
  2010-09-16 13:14                               ` Avi Kivity
  2010-09-16 13:18                               ` Gleb Natapov
  0 siblings, 2 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 12:57 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
> > > > We haver two users: qemu does deasserts, vhost-net does asserts.
> > > Well this is broken. You want KVM to track level for you and this is
> > > wrong. KVM does this anyway because it can't relay on devise model
> > > to behave correctly [0], but in your case it is designed to behave
> > > incorrectly.
> > > 
> > > Interrupt type is a device property. PCI devices just happen to be level
> > > triggered according to PCI spec. What if you want to use vhost-net to
> > > implement network device which has active-low interrupt line? [1]
> > 
> > The polarity would have to be reversed in gsi (irq line can be shared,
> > all devices must be active high or low consistently).
> > 
> There are gsi dedicated to PCI. They can be shared only between PCI
> devices.
> 
> > > If you want to split parts that asserts irq and de-asserts it then we
> > > should have irqfd that tracks line status and knows interrupt line
> > > polarity.
> > 
> > Yes, it can know about polarity even though I think it's cleaner to do this
> > per gsi. But it can not track line status as line is shared with
> > other devices.
> It should track only device's line status.

There is no such thing as device's line status on real hardware, either.
Devices do not drive INT# high: they drive it low (all the time)
or do not drive it at all.

Or consider express, the spec explicitly says:
"Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
 are not errors."

> > 
> > > > Another application is out of process virtio (sandboxing!).
> > > It will still assert and de-assert irq at the same code, so it will be
> > > able to track irq line status.
> > > 
> > > > Again, pci stuff needs to stay in qemu.
> > > > 
> > > 
> > > Nothing to do with PCI whatsoever.
> > > 
> > > [0] most qemu devices behave incorrectly and trigger level irq more then
> > >     needed.
> > 
> > Which devices?
> Most of them. They just call update_irq_status() or something and
> re-assert interrupt regardless of what previous status was.

At least for PCI devices, these calls do nothing if status does not change.

> > pci core tracks line status and will never assert the same
> > line multiple times.
> That's good if pci core does this, but device shouldn't even try it.

I disagree. We don't want to duplicate a ton of code all over
the codebase.

> > 
> > > [1] this is how correct PCI device should behave but we override
> > >     polarity in ACPI, but now incorrect behaviour is deeply designed
> > >     into vhost-net.
> > 
> > Not really, vhost net signals an eventfd. What happens then is
> > up to kvm.
> > 
> That is what current broken design does and it works, but if you want to
> save unneeded calls into kvm fix design.

The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci.
Making vhost aware of pci breaks this, I would not call that fixing the
design.

> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 12:57                             ` Michael S. Tsirkin
@ 2010-09-16 13:14                               ` Avi Kivity
  2010-09-16 13:38                                 ` Michael S. Tsirkin
  2010-09-16 13:18                               ` Gleb Natapov
  1 sibling, 1 reply; 37+ messages in thread
From: Avi Kivity @ 2010-09-16 13:14 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

  On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote:
> >
> >  >  >  If you want to split parts that asserts irq and de-asserts it then we
> >  >  >  should have irqfd that tracks line status and knows interrupt line
> >  >  >  polarity.
> >  >
> >  >  Yes, it can know about polarity even though I think it's cleaner to do this
> >  >  per gsi. But it can not track line status as line is shared with
> >  >  other devices.
> >  It should track only device's line status.
>
> There is no such thing as device's line status on real hardware, either.
> Devices do not drive INT# high: they drive it low (all the time)
> or do not drive it at all.
>

That's just an implementation detail.  Devices either assert INT# or 
they do not.  Tying the wires together constitutes an AND gate.  This 
gate has to be modelled somewhere, currently it's in qemu's pci emulation.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 12:57                             ` Michael S. Tsirkin
  2010-09-16 13:14                               ` Avi Kivity
@ 2010-09-16 13:18                               ` Gleb Natapov
  2010-09-16 13:51                                 ` Michael S. Tsirkin
  1 sibling, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 13:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
> > On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
> > > > > We haver two users: qemu does deasserts, vhost-net does asserts.
> > > > Well this is broken. You want KVM to track level for you and this is
> > > > wrong. KVM does this anyway because it can't relay on devise model
> > > > to behave correctly [0], but in your case it is designed to behave
> > > > incorrectly.
> > > > 
> > > > Interrupt type is a device property. PCI devices just happen to be level
> > > > triggered according to PCI spec. What if you want to use vhost-net to
> > > > implement network device which has active-low interrupt line? [1]
> > > 
> > > The polarity would have to be reversed in gsi (irq line can be shared,
> > > all devices must be active high or low consistently).
> > > 
> > There are gsi dedicated to PCI. They can be shared only between PCI
> > devices.
> > 
> > > > If you want to split parts that asserts irq and de-asserts it then we
> > > > should have irqfd that tracks line status and knows interrupt line
> > > > polarity.
> > > 
> > > Yes, it can know about polarity even though I think it's cleaner to do this
> > > per gsi. But it can not track line status as line is shared with
> > > other devices.
> > It should track only device's line status.
> 
> There is no such thing as device's line status on real hardware, either.
> Devices do not drive INT# high: they drive it low (all the time)
> or do not drive it at all.
Same thing, other naming. Device either drive it low (irq_set(1)) or
not (irq_set(0)).

> 
> Or consider express, the spec explicitly says:
> "Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
>  are not errors."
> 
> > > 
> > > > > Another application is out of process virtio (sandboxing!).
> > > > It will still assert and de-assert irq at the same code, so it will be
> > > > able to track irq line status.
> > > > 
> > > > > Again, pci stuff needs to stay in qemu.
> > > > > 
> > > > 
> > > > Nothing to do with PCI whatsoever.
> > > > 
> > > > [0] most qemu devices behave incorrectly and trigger level irq more then
> > > >     needed.
> > > 
> > > Which devices?
> > Most of them. They just call update_irq_status() or something and
> > re-assert interrupt regardless of what previous status was.
> 
> At least for PCI devices, these calls do nothing if status does not change.
I am not sure what code are you locking at. e1000 device emulation
doesn't check previous line status before calling qemu_set_irq().

> 
> > > pci core tracks line status and will never assert the same
> > > line multiple times.
> > That's good if pci core does this, but device shouldn't even try it.
> 
> I disagree. We don't want to duplicate a ton of code all over
> the codebase.
> 
So abstract it into a function. It shouldn't be part of PCI emulation.

> > > 
> > > > [1] this is how correct PCI device should behave but we override
> > > >     polarity in ACPI, but now incorrect behaviour is deeply designed
> > > >     into vhost-net.
> > > 
> > > Not really, vhost net signals an eventfd. What happens then is
> > > up to kvm.
> > > 
> > That is what current broken design does and it works, but if you want to
> > save unneeded calls into kvm fix design.
> 
> The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci.
> Making vhost aware of pci breaks this, I would not call that fixing the
> design.
> 
Once again. Nothing to do with PCI, everything to do with device
emulation. And I propose to abstract interrupt assertion part into
irqfd, not into vhost.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 13:14                               ` Avi Kivity
@ 2010-09-16 13:38                                 ` Michael S. Tsirkin
  2010-09-16 13:50                                   ` Avi Kivity
  2010-09-16 13:55                                   ` Gleb Natapov
  0 siblings, 2 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 13:38 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 03:14:11PM +0200, Avi Kivity wrote:
>  On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote:
> >>
> >>  >  >  If you want to split parts that asserts irq and de-asserts it then we
> >>  >  >  should have irqfd that tracks line status and knows interrupt line
> >>  >  >  polarity.
> >>  >
> >>  >  Yes, it can know about polarity even though I think it's cleaner to do this
> >>  >  per gsi. But it can not track line status as line is shared with
> >>  >  other devices.
> >>  It should track only device's line status.
> >
> >There is no such thing as device's line status on real hardware, either.
> >Devices do not drive INT# high: they drive it low (all the time)
> >or do not drive it at all.
> >
> 
> That's just an implementation detail.  Devices either assert INT# or
> they do not.  Tying the wires together constitutes an AND gate.
> This gate has to be modelled somewhere, currently it's in qemu's pci
> emulation.

Right. kvm in kernel has this as well, we need to keep this in
kvm kernel if we want to support level with irqfd.
Where it does not belong is individual devices: these
should be able to assert INTx multiple times
and it should have no effect, as per spec.

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 13:38                                 ` Michael S. Tsirkin
@ 2010-09-16 13:50                                   ` Avi Kivity
  2010-09-16 13:55                                   ` Gleb Natapov
  1 sibling, 0 replies; 37+ messages in thread
From: Avi Kivity @ 2010-09-16 13:50 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

  On 09/16/2010 03:38 PM, Michael S. Tsirkin wrote:
> >
> >  That's just an implementation detail.  Devices either assert INT# or
> >  they do not.  Tying the wires together constitutes an AND gate.
> >  This gate has to be modelled somewhere, currently it's in qemu's pci
> >  emulation.
>
> Right. kvm in kernel has this as well, we need to keep this in
> kvm kernel if we want to support level with irqfd.

Right, we added the irq_source_id hack for device assignment.

What's wrong with letting userspace mediate this, though?

> Where it does not belong is individual devices: these
> should be able to assert INTx multiple times
> and it should have no effect, as per spec.
>
>

Yes.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 13:18                               ` Gleb Natapov
@ 2010-09-16 13:51                                 ` Michael S. Tsirkin
  2010-09-16 14:06                                   ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 13:51 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
> > > On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
> > > > > > We haver two users: qemu does deasserts, vhost-net does asserts.
> > > > > Well this is broken. You want KVM to track level for you and this is
> > > > > wrong. KVM does this anyway because it can't relay on devise model
> > > > > to behave correctly [0], but in your case it is designed to behave
> > > > > incorrectly.
> > > > > 
> > > > > Interrupt type is a device property. PCI devices just happen to be level
> > > > > triggered according to PCI spec. What if you want to use vhost-net to
> > > > > implement network device which has active-low interrupt line? [1]
> > > > 
> > > > The polarity would have to be reversed in gsi (irq line can be shared,
> > > > all devices must be active high or low consistently).
> > > > 
> > > There are gsi dedicated to PCI. They can be shared only between PCI
> > > devices.
> > > 
> > > > > If you want to split parts that asserts irq and de-asserts it then we
> > > > > should have irqfd that tracks line status and knows interrupt line
> > > > > polarity.
> > > > 
> > > > Yes, it can know about polarity even though I think it's cleaner to do this
> > > > per gsi. But it can not track line status as line is shared with
> > > > other devices.
> > > It should track only device's line status.
> > 
> > There is no such thing as device's line status on real hardware, either.
> > Devices do not drive INT# high: they drive it low (all the time)
> > or do not drive it at all.
> Same thing, other naming. Device either drive it low (irq_set(1)) or
> not (irq_set(0)).

Well, if I drive it low any number of times it should hae no effect.

> > 
> > Or consider express, the spec explicitly says:
> > "Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
> >  are not errors."
> > 
> > > > 
> > > > > > Another application is out of process virtio (sandboxing!).
> > > > > It will still assert and de-assert irq at the same code, so it will be
> > > > > able to track irq line status.
> > > > > 
> > > > > > Again, pci stuff needs to stay in qemu.
> > > > > > 
> > > > > 
> > > > > Nothing to do with PCI whatsoever.
> > > > > 
> > > > > [0] most qemu devices behave incorrectly and trigger level irq more then
> > > > >     needed.
> > > > 
> > > > Which devices?
> > > Most of them. They just call update_irq_status() or something and
> > > re-assert interrupt regardless of what previous status was.
> > 
> > At least for PCI devices, these calls do nothing if status does not change.
> I am not sure what code are you locking at. e1000 device emulation
> doesn't check previous line status before calling qemu_set_irq().

Right. If you dig through useless levels of indirection, you will
see that each PCI device has 4 pin levels, when one of these
changes this makes it up level to the parent bus, and so on.

> > 
> > > > pci core tracks line status and will never assert the same
> > > > line multiple times.
> > > That's good if pci core does this, but device shouldn't even try it.
> > 
> > I disagree. We don't want to duplicate a ton of code all over
> > the codebase.
> > 
> So abstract it into a function. It shouldn't be part of PCI emulation.

I don't know what you mean by this, send a patch and we can discuss?
Note that when I patches PCI interrupt handling for compliance
I made it mimic hardware as closely as possible: devices
can send any # of assert/deassert messages, bus discards duplicates.

> > > > 
> > > > > [1] this is how correct PCI device should behave but we override
> > > > >     polarity in ACPI, but now incorrect behaviour is deeply designed
> > > > >     into vhost-net.
> > > > 
> > > > Not really, vhost net signals an eventfd. What happens then is
> > > > up to kvm.
> > > > 
> > > That is what current broken design does and it works, but if you want to
> > > save unneeded calls into kvm fix design.
> > 
> > The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci.
> > Making vhost aware of pci breaks this, I would not call that fixing the
> > design.
> > 
> Once again. Nothing to do with PCI, everything to do with device
> emulation. And I propose to abstract interrupt assertion part into
> irqfd, not into vhost.
> 
> --
> 			Gleb.

This could work. KVM would need to find all irqfd
objects mapped to gsi and notify them on deassert.
No idea how hard this is.


-- 
MST

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 13:38                                 ` Michael S. Tsirkin
  2010-09-16 13:50                                   ` Avi Kivity
@ 2010-09-16 13:55                                   ` Gleb Natapov
  1 sibling, 0 replies; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 13:55 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 03:38:28PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 03:14:11PM +0200, Avi Kivity wrote:
> >  On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote:
> > >>
> > >>  >  >  If you want to split parts that asserts irq and de-asserts it then we
> > >>  >  >  should have irqfd that tracks line status and knows interrupt line
> > >>  >  >  polarity.
> > >>  >
> > >>  >  Yes, it can know about polarity even though I think it's cleaner to do this
> > >>  >  per gsi. But it can not track line status as line is shared with
> > >>  >  other devices.
> > >>  It should track only device's line status.
> > >
> > >There is no such thing as device's line status on real hardware, either.
> > >Devices do not drive INT# high: they drive it low (all the time)
> > >or do not drive it at all.
> > >
> > 
> > That's just an implementation detail.  Devices either assert INT# or
> > they do not.  Tying the wires together constitutes an AND gate.
> > This gate has to be modelled somewhere, currently it's in qemu's pci
> > emulation.
> 
> Right. kvm in kernel has this as well, we need to keep this in
> kvm kernel if we want to support level with irqfd.
> Where it does not belong is individual devices: these
> should be able to assert INTx multiple times
> and it should have no effect, as per spec.
Assert_INTx/Deassert_INTx you mentioned are internal PCI thing. What KVM
sees logically is status of the line between pci controller and irq
chip. We do not emulate PCI inside kernel, but I agree that kernel should
handle multiple asserts without de-assert in the middle and, in fact, it does.
But the thread started with you trying to optimize this non-optimal
device behaviour and I am saying that the fix should be elsewhere. Namely in
irqfd.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 13:51                                 ` Michael S. Tsirkin
@ 2010-09-16 14:06                                   ` Gleb Natapov
  2010-09-16 14:23                                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 14:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 03:51:37PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote:
> > On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
> > > > On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
> > > > > > > We haver two users: qemu does deasserts, vhost-net does asserts.
> > > > > > Well this is broken. You want KVM to track level for you and this is
> > > > > > wrong. KVM does this anyway because it can't relay on devise model
> > > > > > to behave correctly [0], but in your case it is designed to behave
> > > > > > incorrectly.
> > > > > > 
> > > > > > Interrupt type is a device property. PCI devices just happen to be level
> > > > > > triggered according to PCI spec. What if you want to use vhost-net to
> > > > > > implement network device which has active-low interrupt line? [1]
> > > > > 
> > > > > The polarity would have to be reversed in gsi (irq line can be shared,
> > > > > all devices must be active high or low consistently).
> > > > > 
> > > > There are gsi dedicated to PCI. They can be shared only between PCI
> > > > devices.
> > > > 
> > > > > > If you want to split parts that asserts irq and de-asserts it then we
> > > > > > should have irqfd that tracks line status and knows interrupt line
> > > > > > polarity.
> > > > > 
> > > > > Yes, it can know about polarity even though I think it's cleaner to do this
> > > > > per gsi. But it can not track line status as line is shared with
> > > > > other devices.
> > > > It should track only device's line status.
> > > 
> > > There is no such thing as device's line status on real hardware, either.
> > > Devices do not drive INT# high: they drive it low (all the time)
> > > or do not drive it at all.
> > Same thing, other naming. Device either drive it low (irq_set(1)) or
> > not (irq_set(0)).
> 
> Well, if I drive it low any number of times it should hae no effect.
> 
There is no meaning of "driving low the line multiple times" on real HW.
You either drive it low or not. We try to emulate this with individual
"drive low/high" events in software.

> > > 
> > > Or consider express, the spec explicitly says:
> > > "Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
> > >  are not errors."
> > > 
> > > > > 
> > > > > > > Another application is out of process virtio (sandboxing!).
> > > > > > It will still assert and de-assert irq at the same code, so it will be
> > > > > > able to track irq line status.
> > > > > > 
> > > > > > > Again, pci stuff needs to stay in qemu.
> > > > > > > 
> > > > > > 
> > > > > > Nothing to do with PCI whatsoever.
> > > > > > 
> > > > > > [0] most qemu devices behave incorrectly and trigger level irq more then
> > > > > >     needed.
> > > > > 
> > > > > Which devices?
> > > > Most of them. They just call update_irq_status() or something and
> > > > re-assert interrupt regardless of what previous status was.
> > > 
> > > At least for PCI devices, these calls do nothing if status does not change.
> > I am not sure what code are you locking at. e1000 device emulation
> > doesn't check previous line status before calling qemu_set_irq().
> 
> Right. If you dig through useless levels of indirection, you will
> see that each PCI device has 4 pin levels, when one of these
> changes this makes it up level to the parent bus, and so on.
Yes. Qemu PCI level does it right. Ideally device would not even invoke 
this logic if interrupt status haven't changed.

> 
> > > 
> > > > > pci core tracks line status and will never assert the same
> > > > > line multiple times.
> > > > That's good if pci core does this, but device shouldn't even try it.
> > > 
> > > I disagree. We don't want to duplicate a ton of code all over
> > > the codebase.
> > > 
> > So abstract it into a function. It shouldn't be part of PCI emulation.
> 
> I don't know what you mean by this, send a patch and we can discuss?
I don't care enough to send patch.  Just remember previous irq status
and do not call qemu_set_irq() if it doesn't change. Three lines of
code.

> Note that when I patches PCI interrupt handling for compliance
> I made it mimic hardware as closely as possible: devices
> can send any # of assert/deassert messages, bus discards duplicates.
> 
Qemu PCI code is correct as far as I can see. Not all devices are connected
via PCI and there is not need to go through couple of layer of
indirection to figure out that nothing should be done.


> > > > > 
> > > > > > [1] this is how correct PCI device should behave but we override
> > > > > >     polarity in ACPI, but now incorrect behaviour is deeply designed
> > > > > >     into vhost-net.
> > > > > 
> > > > > Not really, vhost net signals an eventfd. What happens then is
> > > > > up to kvm.
> > > > > 
> > > > That is what current broken design does and it works, but if you want to
> > > > save unneeded calls into kvm fix design.
> > > 
> > > The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci.
> > > Making vhost aware of pci breaks this, I would not call that fixing the
> > > design.
> > > 
> > Once again. Nothing to do with PCI, everything to do with device
> > emulation. And I propose to abstract interrupt assertion part into
> > irqfd, not into vhost.
> > 
> > --
> > 			Gleb.
> 
> This could work. KVM would need to find all irqfd
> objects mapped to gsi and notify them on deassert.
> No idea how hard this is.
> 
What for? Device emulation should do de-assert.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 14:06                                   ` Gleb Natapov
@ 2010-09-16 14:23                                     ` Michael S. Tsirkin
  2010-09-16 14:51                                       ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 14:23 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 04:06:15PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 03:51:37PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote:
> > > On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
> > > > > On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
> > > > > > > > We haver two users: qemu does deasserts, vhost-net does asserts.
> > > > > > > Well this is broken. You want KVM to track level for you and this is
> > > > > > > wrong. KVM does this anyway because it can't relay on devise model
> > > > > > > to behave correctly [0], but in your case it is designed to behave
> > > > > > > incorrectly.
> > > > > > > 
> > > > > > > Interrupt type is a device property. PCI devices just happen to be level
> > > > > > > triggered according to PCI spec. What if you want to use vhost-net to
> > > > > > > implement network device which has active-low interrupt line? [1]
> > > > > > 
> > > > > > The polarity would have to be reversed in gsi (irq line can be shared,
> > > > > > all devices must be active high or low consistently).
> > > > > > 
> > > > > There are gsi dedicated to PCI. They can be shared only between PCI
> > > > > devices.
> > > > > 
> > > > > > > If you want to split parts that asserts irq and de-asserts it then we
> > > > > > > should have irqfd that tracks line status and knows interrupt line
> > > > > > > polarity.
> > > > > > 
> > > > > > Yes, it can know about polarity even though I think it's cleaner to do this
> > > > > > per gsi. But it can not track line status as line is shared with
> > > > > > other devices.
> > > > > It should track only device's line status.
> > > > 
> > > > There is no such thing as device's line status on real hardware, either.
> > > > Devices do not drive INT# high: they drive it low (all the time)
> > > > or do not drive it at all.
> > > Same thing, other naming. Device either drive it low (irq_set(1)) or
> > > not (irq_set(0)).
> > 
> > Well, if I drive it low any number of times it should hae no effect.
> > 
> There is no meaning of "driving low the line multiple times" on real HW.
> You either drive it low or not. We try to emulate this with individual
> "drive low/high" events in software.
> > > > 
> > > > Or consider express, the spec explicitly says:
> > > > "Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but

Express has assert and deassert messages. It might be easier for
you to think in these terms. level 1: assert, level 0: deassert.
Seems a simple model and what we do models this pretty well.

> > > >  are not errors."
> > > > 
> > > > > > 
> > > > > > > > Another application is out of process virtio (sandboxing!).
> > > > > > > It will still assert and de-assert irq at the same code, so it will be
> > > > > > > able to track irq line status.
> > > > > > > 
> > > > > > > > Again, pci stuff needs to stay in qemu.
> > > > > > > > 
> > > > > > > 
> > > > > > > Nothing to do with PCI whatsoever.
> > > > > > > 
> > > > > > > [0] most qemu devices behave incorrectly and trigger level irq more then
> > > > > > >     needed.
> > > > > > 
> > > > > > Which devices?
> > > > > Most of them. They just call update_irq_status() or something and
> > > > > re-assert interrupt regardless of what previous status was.
> > > > 
> > > > At least for PCI devices, these calls do nothing if status does not change.
> > > I am not sure what code are you locking at. e1000 device emulation
> > > doesn't check previous line status before calling qemu_set_irq().
> > 
> > Right. If you dig through useless levels of indirection, you will
> > see that each PCI device has 4 pin levels, when one of these
> > changes this makes it up level to the parent bus, and so on.
> Yes. Qemu PCI level does it right. Ideally device would not even invoke 
> this logic if interrupt status haven't changed.

It needs to call *some* function to check status
and assert, right? qemu_set_irq is that function.

> > 
> > > > 
> > > > > > pci core tracks line status and will never assert the same
> > > > > > line multiple times.
> > > > > That's good if pci core does this, but device shouldn't even try it.
> > > > 
> > > > I disagree. We don't want to duplicate a ton of code all over
> > > > the codebase.
> > > > 
> > > So abstract it into a function. It shouldn't be part of PCI emulation.
> > 
> > I don't know what you mean by this, send a patch and we can discuss?
> I don't care enough to send patch.  Just remember previous irq status
> and do not call qemu_set_irq() if it doesn't change. Three lines of
> code.

Heh, we have a ton of devices to support.
And then we need to migrate this extra status, and make sure it's in
sync with PCI code.  We'll end up with much more more than 3 lines all
of it in a very sensitive and hard to test parts code.

> > Note that when I patches PCI interrupt handling for compliance
> > I made it mimic hardware as closely as possible: devices
> > can send any # of assert/deassert messages, bus discards duplicates.
> > 
> Qemu PCI code is correct as far as I can see. Not all devices are connected
> via PCI and there is not need to go through couple of layer of
> indirection to figure out that nothing should be done.
> 

If we want to remove the indirection I would be much more
interested to remove it for all cases, not just when
nothing should be done.

> > > > > > 
> > > > > > > [1] this is how correct PCI device should behave but we override
> > > > > > >     polarity in ACPI, but now incorrect behaviour is deeply designed
> > > > > > >     into vhost-net.
> > > > > > 
> > > > > > Not really, vhost net signals an eventfd. What happens then is
> > > > > > up to kvm.
> > > > > > 
> > > > > That is what current broken design does and it works, but if you want to
> > > > > save unneeded calls into kvm fix design.
> > > > 
> > > > The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci.
> > > > Making vhost aware of pci breaks this, I would not call that fixing the
> > > > design.
> > > > 
> > > Once again. Nothing to do with PCI, everything to do with device
> > > emulation. And I propose to abstract interrupt assertion part into
> > > irqfd, not into vhost.
> > > 
> > > --
> > > 			Gleb.
> > 
> > This could work. KVM would need to find all irqfd
> > objects mapped to gsi and notify them on deassert.
> > No idea how hard this is.
> > 
> What for? Device emulation should do de-assert.

Sorry, but at this point I have no idea what you call device emulation.
qemu has code to de-assert. vhost has code to assert.
I would like to optimize level interrupts and stop driving
scheduler insane if at all possible.

> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 14:23                                     ` Michael S. Tsirkin
@ 2010-09-16 14:51                                       ` Gleb Natapov
  2010-09-16 15:24                                         ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 14:51 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 04:23:35PM +0200, Michael S. Tsirkin wrote:
> > > > > There is no such thing as device's line status on real hardware, either.
> > > > > Devices do not drive INT# high: they drive it low (all the time)
> > > > > or do not drive it at all.
> > > > Same thing, other naming. Device either drive it low (irq_set(1)) or
> > > > not (irq_set(0)).
> > > 
> > > Well, if I drive it low any number of times it should hae no effect.
> > > 
> > There is no meaning of "driving low the line multiple times" on real HW.
> > You either drive it low or not. We try to emulate this with individual
> > "drive low/high" events in software.
> > > > > 
> > > > > Or consider express, the spec explicitly says:
> > > > > "Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
> 
> Express has assert and deassert messages. It might be easier for
> you to think in these terms. level 1: assert, level 0: deassert.
> Seems a simple model and what we do models this pretty well.
> 
They are between device and pci controller. I am talking about what
happens between pci controller and irq chip. We are talking about
different things really.

> > > > >  are not errors."
> > > > > 
> > > > > > > 
> > > > > > > > > Another application is out of process virtio (sandboxing!).
> > > > > > > > It will still assert and de-assert irq at the same code, so it will be
> > > > > > > > able to track irq line status.
> > > > > > > > 
> > > > > > > > > Again, pci stuff needs to stay in qemu.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Nothing to do with PCI whatsoever.
> > > > > > > > 
> > > > > > > > [0] most qemu devices behave incorrectly and trigger level irq more then
> > > > > > > >     needed.
> > > > > > > 
> > > > > > > Which devices?
> > > > > > Most of them. They just call update_irq_status() or something and
> > > > > > re-assert interrupt regardless of what previous status was.
> > > > > 
> > > > > At least for PCI devices, these calls do nothing if status does not change.
> > > > I am not sure what code are you locking at. e1000 device emulation
> > > > doesn't check previous line status before calling qemu_set_irq().
> > > 
> > > Right. If you dig through useless levels of indirection, you will
> > > see that each PCI device has 4 pin levels, when one of these
> > > changes this makes it up level to the parent bus, and so on.
> > Yes. Qemu PCI level does it right. Ideally device would not even invoke 
> > this logic if interrupt status haven't changed.
> 
> It needs to call *some* function to check status
> and assert, right? qemu_set_irq is that function.
qemu_set_irq does not check previous level and calls a callback
unconditionally.

> 
> > > 
> > > > > 
> > > > > > > pci core tracks line status and will never assert the same
> > > > > > > line multiple times.
> > > > > > That's good if pci core does this, but device shouldn't even try it.
> > > > > 
> > > > > I disagree. We don't want to duplicate a ton of code all over
> > > > > the codebase.
> > > > > 
> > > > So abstract it into a function. It shouldn't be part of PCI emulation.
> > > 
> > > I don't know what you mean by this, send a patch and we can discuss?
> > I don't care enough to send patch.  Just remember previous irq status
> > and do not call qemu_set_irq() if it doesn't change. Three lines of
> > code.
> 
> Heh, we have a ton of devices to support.
So?

> And then we need to migrate this extra status, and make sure it's in
> sync with PCI code.  We'll end up with much more more than 3 lines all
> of it in a very sensitive and hard to test parts code.
> 
You should be able to reconstruct it from device state. What should be
in sync with PCI code? 

> > > Note that when I patches PCI interrupt handling for compliance
> > > I made it mimic hardware as closely as possible: devices
> > > can send any # of assert/deassert messages, bus discards duplicates.
> > > 
> > Qemu PCI code is correct as far as I can see. Not all devices are connected
> > via PCI and there is not need to go through couple of layer of
> > indirection to figure out that nothing should be done.
> > 
> 
> If we want to remove the indirection I would be much more
> interested to remove it for all cases, not just when
> nothing should be done.
I don't care. This indirection may be justified for all I know. You try to
shift this discussion to areas I am not interested to look into :) All I
am saying is that each device is capable of knowing its current irq line
state and optimize out function call + additional logic. Whether upper
layer should handle two asserts without de-assert in between is different
point and I think we agree on it.

> 
> > > > > > > 
> > > > > > > > [1] this is how correct PCI device should behave but we override
> > > > > > > >     polarity in ACPI, but now incorrect behaviour is deeply designed
> > > > > > > >     into vhost-net.
> > > > > > > 
> > > > > > > Not really, vhost net signals an eventfd. What happens then is
> > > > > > > up to kvm.
> > > > > > > 
> > > > > > That is what current broken design does and it works, but if you want to
> > > > > > save unneeded calls into kvm fix design.
> > > > > 
> > > > > The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci.
> > > > > Making vhost aware of pci breaks this, I would not call that fixing the
> > > > > design.
> > > > > 
> > > > Once again. Nothing to do with PCI, everything to do with device
> > > > emulation. And I propose to abstract interrupt assertion part into
> > > > irqfd, not into vhost.
> > > > 
> > > > --
> > > > 			Gleb.
> > > 
> > > This could work. KVM would need to find all irqfd
> > > objects mapped to gsi and notify them on deassert.
> > > No idea how hard this is.
> > > 
> > What for? Device emulation should do de-assert.
> 
> Sorry, but at this point I have no idea what you call device emulation.
The same thing everyone calls device emulation. In case of virtio-net it
is in hw/virtio-net.c. If vhost-net is in use device emulation is split
between userspace and kernel, but it is still just device emulation.

> qemu has code to de-assert. vhost has code to assert.
Good. So qemu will de-assert. So what do you mean by
"KVM would need to find all irqfd objects mapped to gsi and notify
them on deassert"

> I would like to optimize level interrupts and stop driving
> scheduler insane if at all possible.
> 
Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
level hasn't changed.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 14:51                                       ` Gleb Natapov
@ 2010-09-16 15:24                                         ` Michael S. Tsirkin
  2010-09-16 15:43                                           ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 15:24 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote:
> > > What for? Device emulation should do de-assert.
> > 
> > Sorry, but at this point I have no idea what you call device emulation.
> The same thing everyone calls device emulation. In case of virtio-net it
> is in hw/virtio-net.c. If vhost-net is in use device emulation is split
> between userspace and kernel, but it is still just device emulation.


case in point, virtio net does not know about pci at all,
so it can not deassert.

> > qemu has code to de-assert. vhost has code to assert.
> Good. So qemu will de-assert. So what do you mean by
> "KVM would need to find all irqfd objects mapped to gsi and notify
> them on deassert"
> 
> > I would like to optimize level interrupts and stop driving
> > scheduler insane if at all possible.
> > 
> Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
> level hasn't changed.

Right. Then it needs to know about deasserts. It does not get this
information, so when kvm gets deassert ioctl it should
locate irqfd object and set level to 0.
See?

-- 
MST


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 15:24                                         ` Michael S. Tsirkin
@ 2010-09-16 15:43                                           ` Gleb Natapov
  2010-09-16 22:07                                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-16 15:43 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 05:24:11PM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote:
> > > > What for? Device emulation should do de-assert.
> > > 
> > > Sorry, but at this point I have no idea what you call device emulation.
> > The same thing everyone calls device emulation. In case of virtio-net it
> > is in hw/virtio-net.c. If vhost-net is in use device emulation is split
> > between userspace and kernel, but it is still just device emulation.
> 
> 
> case in point, virtio net does not know about pci at all,
> so it can not deassert.
> 
I don't see what PCI has to do with it. virtio-net device implementation is
split between several files. virtio-net/virtio-ring/virtio-pci. All of
them still implement one emulated device. Whoever implemented virtio-net
initially decided to put irq ack register into PCI config space, so code
that does de-assert is in virtio-pci, but it is still part of emulated
device.

> > > qemu has code to de-assert. vhost has code to assert.
> > Good. So qemu will de-assert. So what do you mean by
> > "KVM would need to find all irqfd objects mapped to gsi and notify
> > them on deassert"
> > 
> > > I would like to optimize level interrupts and stop driving
> > > scheduler insane if at all possible.
> > > 
> > Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
> > level hasn't changed.
> 
> Right. Then it needs to know about deasserts. It does not get this
Dessert should be done by qemu writing 0 into irqfd. Assertion and
de-assertion should be done through irqfd.

> information, so when kvm gets deassert ioctl it should
> locate irqfd object and set level to 0.
> See?
No.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 15:43                                           ` Gleb Natapov
@ 2010-09-16 22:07                                             ` Michael S. Tsirkin
  2010-09-17  7:59                                               ` Gleb Natapov
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-16 22:07 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Thu, Sep 16, 2010 at 05:43:26PM +0200, Gleb Natapov wrote:
> On Thu, Sep 16, 2010 at 05:24:11PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote:
> > > > > What for? Device emulation should do de-assert.
> > > > 
> > > > Sorry, but at this point I have no idea what you call device emulation.
> > > The same thing everyone calls device emulation. In case of virtio-net it
> > > is in hw/virtio-net.c. If vhost-net is in use device emulation is split
> > > between userspace and kernel, but it is still just device emulation.
> > 
> > 
> > case in point, virtio net does not know about pci at all,
> > so it can not deassert.
> > 
> I don't see what PCI has to do with it. virtio-net device implementation is
> split between several files. virtio-net/virtio-ring/virtio-pci. All of
> them still implement one emulated device. Whoever implemented virtio-net
> initially decided to put irq ack register into PCI config space, so code
> that does de-assert is in virtio-pci,
> but it is still part of emulated
> device.
> 
> > > > qemu has code to de-assert. vhost has code to assert.
> > > Good. So qemu will de-assert. So what do you mean by
> > > "KVM would need to find all irqfd objects mapped to gsi and notify
> > > them on deassert"
> > > 
> > > > I would like to optimize level interrupts and stop driving
> > > > scheduler insane if at all possible.
> > > > 
> > > Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
> > > level hasn't changed.
> > 
> > Right. Then it needs to know about deasserts. It does not get this
> Dessert should be done by qemu writing 0 into irqfd.

writing 0 to eventfd does nothing. The way to deassert irq
is currently through ioctl and it seems entirely sensible
to me to keep it that way.

Also - which irqfd :)
We need multiple irqfds to map to a single gsi, with kvm
doing OR on them.

> Assertion and
> de-assertion should be done through irqfd.
> > information, so when kvm gets deassert ioctl it should
> > locate irqfd object and set level to 0.
> > See?
> No.
> 
> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-16 22:07                                             ` Michael S. Tsirkin
@ 2010-09-17  7:59                                               ` Gleb Natapov
  2010-09-19 10:45                                                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-17  7:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Fri, Sep 17, 2010 at 12:07:15AM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 05:43:26PM +0200, Gleb Natapov wrote:
> > On Thu, Sep 16, 2010 at 05:24:11PM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote:
> > > > > > What for? Device emulation should do de-assert.
> > > > > 
> > > > > Sorry, but at this point I have no idea what you call device emulation.
> > > > The same thing everyone calls device emulation. In case of virtio-net it
> > > > is in hw/virtio-net.c. If vhost-net is in use device emulation is split
> > > > between userspace and kernel, but it is still just device emulation.
> > > 
> > > 
> > > case in point, virtio net does not know about pci at all,
> > > so it can not deassert.
> > > 
> > I don't see what PCI has to do with it. virtio-net device implementation is
> > split between several files. virtio-net/virtio-ring/virtio-pci. All of
> > them still implement one emulated device. Whoever implemented virtio-net
> > initially decided to put irq ack register into PCI config space, so code
> > that does de-assert is in virtio-pci,
> > but it is still part of emulated
> > device.
> > 
> > > > > qemu has code to de-assert. vhost has code to assert.
> > > > Good. So qemu will de-assert. So what do you mean by
> > > > "KVM would need to find all irqfd objects mapped to gsi and notify
> > > > them on deassert"
> > > > 
> > > > > I would like to optimize level interrupts and stop driving
> > > > > scheduler insane if at all possible.
> > > > > 
> > > > Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
> > > > level hasn't changed.
> > > 
> > > Right. Then it needs to know about deasserts. It does not get this
> > Dessert should be done by qemu writing 0 into irqfd.
> 
> writing 0 to eventfd does nothing. The way to deassert irq
That is implementation detail of current irqfd. It was designed for MSI
not level triggered interrupts.

> is currently through ioctl and it seems entirely sensible
> to me to keep it that way.
Then do not complain.

> 
> Also - which irqfd :)
> We need multiple irqfds to map to a single gsi, with kvm
> doing OR on them.
Forget about gsi for now. Care only about your device state. KVM will
keep track of sharing.  Although having irqfd per gsi and keep tracking
there in not a bad idea.

> 
> > Assertion and
> > de-assertion should be done through irqfd.
> > > information, so when kvm gets deassert ioctl it should
> > > locate irqfd object and set level to 0.
> > > See?
> > No.
> > 
> > --
> > 			Gleb.

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-17  7:59                                               ` Gleb Natapov
@ 2010-09-19 10:45                                                 ` Michael S. Tsirkin
  2010-09-19 10:56                                                   ` Avi Kivity
  0 siblings, 1 reply; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-19 10:45 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> > writing 0 to eventfd does nothing. The way to deassert irq
> That is implementation detail of current irqfd. It was designed for MSI
> not level triggered interrupts.

Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
Level which switches to 1 and back to 0 immediately will be racy anyway
...
Avi?

-- 
MST

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-19 10:56                                                   ` Avi Kivity
@ 2010-09-19 10:55                                                     ` Michael S. Tsirkin
  2010-09-19 11:05                                                       ` Avi Kivity
  2010-09-19 11:18                                                       ` Gleb Natapov
  0 siblings, 2 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-19 10:55 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

On Sun, Sep 19, 2010 at 12:56:12PM +0200, Avi Kivity wrote:
>  On 09/19/2010 12:45 PM, Michael S. Tsirkin wrote:
> >On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> >>  >  writing 0 to eventfd does nothing. The way to deassert irq
> >>  That is implementation detail of current irqfd. It was designed for MSI
> >>  not level triggered interrupts.
> >
> >Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
> >Level which switches to 1 and back to 0 immediately will be racy anyway
> >...
> >
> 
> Add a check where?

I would make sure that if you bind irqfd to a non-MSI GSI,
signalling it has no effect.

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-19 10:45                                                 ` Michael S. Tsirkin
@ 2010-09-19 10:56                                                   ` Avi Kivity
  2010-09-19 10:55                                                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 37+ messages in thread
From: Avi Kivity @ 2010-09-19 10:56 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

  On 09/19/2010 12:45 PM, Michael S. Tsirkin wrote:
> On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> >  >  writing 0 to eventfd does nothing. The way to deassert irq
> >  That is implementation detail of current irqfd. It was designed for MSI
> >  not level triggered interrupts.
>
> Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
> Level which switches to 1 and back to 0 immediately will be racy anyway
> ...
>

Add a check where?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-19 10:55                                                     ` Michael S. Tsirkin
@ 2010-09-19 11:05                                                       ` Avi Kivity
  2010-09-19 11:23                                                         ` Michael S. Tsirkin
  2010-09-19 11:18                                                       ` Gleb Natapov
  1 sibling, 1 reply; 37+ messages in thread
From: Avi Kivity @ 2010-09-19 11:05 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

  On 09/19/2010 12:55 PM, Michael S. Tsirkin wrote:
> On Sun, Sep 19, 2010 at 12:56:12PM +0200, Avi Kivity wrote:
> >   On 09/19/2010 12:45 PM, Michael S. Tsirkin wrote:
> >  >On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> >  >>   >   writing 0 to eventfd does nothing. The way to deassert irq
> >  >>   That is implementation detail of current irqfd. It was designed for MSI
> >  >>   not level triggered interrupts.
> >  >
> >  >Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
> >  >Level which switches to 1 and back to 0 immediately will be racy anyway
> >  >...
> >  >
> >
> >  Add a check where?
>
> I would make sure that if you bind irqfd to a non-MSI GSI,
> signalling it has no effect.
>

Certainly.  We should also update the documentation, when we have irqfd 
documentation.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-19 10:55                                                     ` Michael S. Tsirkin
  2010-09-19 11:05                                                       ` Avi Kivity
@ 2010-09-19 11:18                                                       ` Gleb Natapov
  2010-09-19 11:21                                                         ` Michael S. Tsirkin
  1 sibling, 1 reply; 37+ messages in thread
From: Gleb Natapov @ 2010-09-19 11:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Sun, Sep 19, 2010 at 12:55:12PM +0200, Michael S. Tsirkin wrote:
> On Sun, Sep 19, 2010 at 12:56:12PM +0200, Avi Kivity wrote:
> >  On 09/19/2010 12:45 PM, Michael S. Tsirkin wrote:
> > >On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> > >>  >  writing 0 to eventfd does nothing. The way to deassert irq
> > >>  That is implementation detail of current irqfd. It was designed for MSI
> > >>  not level triggered interrupts.
> > >
> > >Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
> > >Level which switches to 1 and back to 0 immediately will be racy anyway
> > >...
> > >
> > 
> > Add a check where?
> 
> I would make sure that if you bind irqfd to a non-MSI GSI,
> signalling it has no effect.
> 
Why would you do that? I am not against checking per se, but why a user of
the API can't check for that?

--
			Gleb.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-19 11:18                                                       ` Gleb Natapov
@ 2010-09-19 11:21                                                         ` Michael S. Tsirkin
  0 siblings, 0 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-19 11:21 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Avi Kivity, Marcelo Tosatti, kvm, linux-kernel

On Sun, Sep 19, 2010 at 01:18:44PM +0200, Gleb Natapov wrote:
> On Sun, Sep 19, 2010 at 12:55:12PM +0200, Michael S. Tsirkin wrote:
> > On Sun, Sep 19, 2010 at 12:56:12PM +0200, Avi Kivity wrote:
> > >  On 09/19/2010 12:45 PM, Michael S. Tsirkin wrote:
> > > >On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> > > >>  >  writing 0 to eventfd does nothing. The way to deassert irq
> > > >>  That is implementation detail of current irqfd. It was designed for MSI
> > > >>  not level triggered interrupts.
> > > >
> > > >Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
> > > >Level which switches to 1 and back to 0 immediately will be racy anyway
> > > >...
> > > >
> > > 
> > > Add a check where?
> > 
> > I would make sure that if you bind irqfd to a non-MSI GSI,
> > signalling it has no effect.
> > 
> Why would you do that? I am not against checking per se, but why a user of
> the API can't check for that?
> 
> --
> 			Gleb.

Currently using irqfd with level will kind of work, sometimes.
Better have it consistently not working so application
writers get the message that they can't rely on it.

-- 
MST

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC] kvm: enable irq injection from interrupt context
  2010-09-19 11:05                                                       ` Avi Kivity
@ 2010-09-19 11:23                                                         ` Michael S. Tsirkin
  0 siblings, 0 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2010-09-19 11:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Gleb Natapov, Marcelo Tosatti, kvm, linux-kernel

On Sun, Sep 19, 2010 at 01:05:09PM +0200, Avi Kivity wrote:
>  On 09/19/2010 12:55 PM, Michael S. Tsirkin wrote:
> >On Sun, Sep 19, 2010 at 12:56:12PM +0200, Avi Kivity wrote:
> >>   On 09/19/2010 12:45 PM, Michael S. Tsirkin wrote:
> >>  >On Fri, Sep 17, 2010 at 09:59:29AM +0200, Gleb Natapov wrote:
> >>  >>   >   writing 0 to eventfd does nothing. The way to deassert irq
> >>  >>   That is implementation detail of current irqfd. It was designed for MSI
> >>  >>   not level triggered interrupts.
> >>  >
> >>  >Maybe we should add a check that gsi is mapped to MSI (or unmapped) then?
> >>  >Level which switches to 1 and back to 0 immediately will be racy anyway
> >>  >...
> >>  >
> >>
> >>  Add a check where?
> >
> >I would make sure that if you bind irqfd to a non-MSI GSI,
> >signalling it has no effect.
> >
> 
> Certainly.  We should also update the documentation, when we have
> irqfd documentation.

When and if we start supporing level, we'll have to add a capability flag.
By way of documentation, reserve a bit value in header right now?

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2010-09-19 11:29 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-15 18:54 [PATCH RFC] kvm: enable irq injection from interrupt context Michael S. Tsirkin
2010-09-16  9:02 ` Gleb Natapov
2010-09-16  9:10   ` Michael S. Tsirkin
2010-09-16  9:25     ` Gleb Natapov
2010-09-16  9:43       ` Michael S. Tsirkin
2010-09-16  9:46       ` Avi Kivity
2010-09-16  9:53         ` Michael S. Tsirkin
2010-09-16 10:13           ` Gleb Natapov
2010-09-16 10:13             ` Michael S. Tsirkin
2010-09-16 10:20               ` Gleb Natapov
2010-09-16 10:44                 ` Michael S. Tsirkin
2010-09-16 10:54                   ` Gleb Natapov
2010-09-16 10:53                     ` Michael S. Tsirkin
2010-09-16 11:17                       ` Gleb Natapov
2010-09-16 12:13                         ` Michael S. Tsirkin
2010-09-16 12:33                           ` Gleb Natapov
2010-09-16 12:57                             ` Michael S. Tsirkin
2010-09-16 13:14                               ` Avi Kivity
2010-09-16 13:38                                 ` Michael S. Tsirkin
2010-09-16 13:50                                   ` Avi Kivity
2010-09-16 13:55                                   ` Gleb Natapov
2010-09-16 13:18                               ` Gleb Natapov
2010-09-16 13:51                                 ` Michael S. Tsirkin
2010-09-16 14:06                                   ` Gleb Natapov
2010-09-16 14:23                                     ` Michael S. Tsirkin
2010-09-16 14:51                                       ` Gleb Natapov
2010-09-16 15:24                                         ` Michael S. Tsirkin
2010-09-16 15:43                                           ` Gleb Natapov
2010-09-16 22:07                                             ` Michael S. Tsirkin
2010-09-17  7:59                                               ` Gleb Natapov
2010-09-19 10:45                                                 ` Michael S. Tsirkin
2010-09-19 10:56                                                   ` Avi Kivity
2010-09-19 10:55                                                     ` Michael S. Tsirkin
2010-09-19 11:05                                                       ` Avi Kivity
2010-09-19 11:23                                                         ` Michael S. Tsirkin
2010-09-19 11:18                                                       ` Gleb Natapov
2010-09-19 11:21                                                         ` Michael S. Tsirkin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.