All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] interrupt mitigation for e1000
@ 2012-07-24 16:58 Luigi Rizzo
  2012-07-25  7:47 ` Stefan Hajnoczi
  2012-07-25  8:53 ` Avi Kivity
  0 siblings, 2 replies; 7+ messages in thread
From: Luigi Rizzo @ 2012-07-24 16:58 UTC (permalink / raw)
  To: qemu-devel

I noticed that the various NIC modules in qemu/kvm do not implement
interrupt mitigation, which is very beneficial as it dramatically
reduces exits from the hypervisor.

As a proof of concept i tried to implement it for the e1000 driver
(patch below), and it brings tx performance from 9 to 56Kpps on
qemu-softmmu, and from ~20 to 140Kpps on qemu-kvm.

I am going to measure the rx interrupt mitigation in the next couple
of days.

Is there any interest in having this code in ?

	cheers
	luigi

diff -ubwrp --exclude '*.[do]' /tmp/qemu-61dc008/hw/e1000.c ./hw/e1000.c
--- /tmp/qemu-61dc008/hw/e1000.c	2012-07-20 01:25:52.000000000 +0200
+++ ./hw/e1000.c	2012-07-24 18:21:39.000000000 +0200
@@ -33,6 +33,8 @@
 #include "sysemu.h"
 #include "dma.h"
 
+#define MITIGATION
+
 #include "e1000_hw.h"
 
 #define E1000_DEBUG
@@ -127,6 +129,13 @@ typedef struct E1000State_st {
     } eecd_state;
 
     QEMUTimer *autoneg_timer;
+
+#ifdef MITIGATION
+    QEMUBH *int_bh;	// interrupt mitigation handler
+    int tx_ics_count;	// pending tx int requests
+    int rx_ics_count;	// pending rx int requests
+    int int_cause;	// int cause
+#endif // MITIGATION
 } E1000State;
 
 #define	defreg(x)	x = (E1000_##x>>2)
@@ -638,6 +648,26 @@ start_xmit(E1000State *s)
         return;
     }
 
+#ifdef MITIGATION
+    /* we transmit the first few packets, or we do if we are
+     * approaching a full ring. in the latter case, also
+     * send an ics.
+     * 
+     */
+{
+    int len, pending;
+    len = s->mac_reg[TDLEN] / sizeof(desc) ;
+    pending = s->mac_reg[TDT] - s->mac_reg[TDH];
+    if (pending < 0)
+	pending += len;
+    /* ignore requests after the first few ones, as long as
+     * we are not approaching a full ring.
+     * Otherwise, deliver packets to the backend.
+     */
+    if (s->tx_ics_count > 4 && s->tx_ics_count + pending < len - 5)
+	return;
+#endif // MITIGATION
+
     while (s->mac_reg[TDH] != s->mac_reg[TDT]) {
         base = tx_desc_base(s) +
                sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];
@@ -663,7 +693,21 @@ start_xmit(E1000State *s)
             break;
         }
     }
+#ifdef MITIGATION
+    s->int_cause |= cause; // remember the interrupt cause.
+    s->tx_ics_count += pending;
+    if (s->tx_ics_count >= len - 5) {
+        // if the ring is about to become full, generate an interrupt
+	set_ics(s, 0, s->int_cause);
+	s->tx_ics_count = 0;
+	s->int_cause = 0;
+    } else {	// otherwise just schedule it for later.
+        qemu_bh_schedule_idle(s->int_bh);
+    }
+}
+#else /* !MITIGATION */
     set_ics(s, 0, cause);
+#endif
 }
 
 static int
@@ -875,7 +919,27 @@ e1000_receive(VLANClientState *nc, const
         s->rxbuf_min_shift)
         n |= E1000_ICS_RXDMT0;
 
+#ifdef MITIGATION
+#define MIT_RXDMT0_SENT	100000	// large
+    s->int_cause |= n;
+    if (s->rx_ics_count == 0) {
+	/* deliver the first interrupt */
+	set_ics(s, 0, s->int_cause);
+	s->int_cause = 0;
+	s->rx_ics_count++;
+    } else if ( (n & E1000_ICS_RXDMT0) && s->rx_ics_count < MIT_RXDMT0_SENT) {
+	/* also deliver if we are approaching ring full */
+	set_ics(s, 0, s->int_cause);
+	s->int_cause = 0;
+	s->rx_ics_count = MIT_RXDMT0_SENT;
+    } else {
+	/* otherwise schedule for later */
+	s->rx_ics_count++;
+	qemu_bh_schedule_idle(s->int_bh);
+    }
+#else /* !MITIGATION */
     set_ics(s, 0, n);
+#endif /* !MITIGATION */
 
     return size;
 }
@@ -1214,6 +1281,20 @@ static NetClientInfo net_e1000_info = {
     .link_status_changed = e1000_set_link_status,
 };
 
+#ifdef MITIGATION
+static void e1000_int_bh(void *opaque)
+{
+    E1000State *s = opaque;
+    if (s->tx_ics_count < 1 && s->rx_ics_count < 1)
+	return;
+    s->tx_ics_count = 0;
+    s->rx_ics_count = 0;
+    start_xmit(s);
+    set_ics(s, 0, s->int_cause);
+    s->int_cause = 0;
+}
+#endif /* MITIGATION */
+
 static int pci_e1000_init(PCIDevice *pci_dev)
 {
     E1000State *d = DO_UPCAST(E1000State, dev, pci_dev);
@@ -1231,6 +1312,9 @@ static int pci_e1000_init(PCIDevice *pci
 
     e1000_mmio_setup(d);
 
+#ifdef MITIGATION
+    d->int_bh = qemu_bh_new(e1000_int_bh, d);
+#endif /* MITIGATION */
     pci_register_bar(&d->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio);
 
     pci_register_bar(&d->dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io);

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] interrupt mitigation for e1000
  2012-07-24 16:58 [Qemu-devel] interrupt mitigation for e1000 Luigi Rizzo
@ 2012-07-25  7:47 ` Stefan Hajnoczi
  2012-07-25  8:53 ` Avi Kivity
  1 sibling, 0 replies; 7+ messages in thread
From: Stefan Hajnoczi @ 2012-07-25  7:47 UTC (permalink / raw)
  To: Luigi Rizzo; +Cc: Jason Wang, qemu-devel, Anthony Liguori, Michael S. Tsirkin

On Tue, Jul 24, 2012 at 5:58 PM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
> I noticed that the various NIC modules in qemu/kvm do not implement
> interrupt mitigation, which is very beneficial as it dramatically
> reduces exits from the hypervisor.
>
> As a proof of concept i tried to implement it for the e1000 driver
> (patch below), and it brings tx performance from 9 to 56Kpps on
> qemu-softmmu, and from ~20 to 140Kpps on qemu-kvm.
>
> I am going to measure the rx interrupt mitigation in the next couple
> of days.
>
> Is there any interest in having this code in ?

Seems useful, especially since you have benchmark results to show the
improvement.

Please include a Signed-off-by: line for the patch when you submit it.
 For details, see:
http://wiki.qemu.org/Contribute/SubmitAPatch

I have CCed folks who have worked on the e1000 model.

Stefan

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] interrupt mitigation for e1000
  2012-07-24 16:58 [Qemu-devel] interrupt mitigation for e1000 Luigi Rizzo
  2012-07-25  7:47 ` Stefan Hajnoczi
@ 2012-07-25  8:53 ` Avi Kivity
  2012-07-25  9:56   ` Luigi Rizzo
  1 sibling, 1 reply; 7+ messages in thread
From: Avi Kivity @ 2012-07-25  8:53 UTC (permalink / raw)
  To: Luigi Rizzo; +Cc: qemu-devel

On 07/24/2012 07:58 PM, Luigi Rizzo wrote:
> I noticed that the various NIC modules in qemu/kvm do not implement
> interrupt mitigation, which is very beneficial as it dramatically
> reduces exits from the hypervisor.
> 
> As a proof of concept i tried to implement it for the e1000 driver
> (patch below), and it brings tx performance from 9 to 56Kpps on
> qemu-softmmu, and from ~20 to 140Kpps on qemu-kvm.
> 
> I am going to measure the rx interrupt mitigation in the next couple
> of days.
> 
> Is there any interest in having this code in ?

Indeed.  But please drop the #ifdef MITIGATIONs.

> +
> +#ifdef MITIGATION
> +    QEMUBH *int_bh;	// interrupt mitigation handler
> +    int tx_ics_count;	// pending tx int requests
> +    int rx_ics_count;	// pending rx int requests
> +    int int_cause;	// int cause

Use uint32_t for int_cause, also a correctly sized type for the packet
counts.

>  
> +#ifdef MITIGATION
> +    /* we transmit the first few packets, or we do if we are
> +     * approaching a full ring. in the latter case, also
> +     * send an ics.
> +     * 
> +     */
> +{
> +    int len, pending;
> +    len = s->mac_reg[TDLEN] / sizeof(desc) ;
> +    pending = s->mac_reg[TDT] - s->mac_reg[TDH];
> +    if (pending < 0)
> +	pending += len;
> +    /* ignore requests after the first few ones, as long as
> +     * we are not approaching a full ring.
> +     * Otherwise, deliver packets to the backend.
> +     */
> +    if (s->tx_ics_count > 4 && s->tx_ics_count + pending < len - 5)
> +	return;

Where do the 4 and 5 come from?  Shouldn't they be controlled by the
guest using a device register?

>      }
> +#ifdef MITIGATION
> +    s->int_cause |= cause; // remember the interrupt cause.
> +    s->tx_ics_count += pending;
> +    if (s->tx_ics_count >= len - 5) {
> +        // if the ring is about to become full, generate an interrupt

Another magic number.

> +	set_ics(s, 0, s->int_cause);
> +	s->tx_ics_count = 0;
> +	s->int_cause = 0;
> +    } else {	// otherwise just schedule it for later.
> +        qemu_bh_schedule_idle(s->int_bh);
> +    }
> +}
> +#else /* !MITIGATION */
>      set_ics(s, 0, cause);
> +#endif
>  }
>  

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] interrupt mitigation for e1000
  2012-07-25  8:53 ` Avi Kivity
@ 2012-07-25  9:56   ` Luigi Rizzo
  2012-07-25 10:00     ` Avi Kivity
  2012-07-25 10:12     ` Paolo Bonzini
  0 siblings, 2 replies; 7+ messages in thread
From: Luigi Rizzo @ 2012-07-25  9:56 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel

On Wed, Jul 25, 2012 at 11:53:29AM +0300, Avi Kivity wrote:
> On 07/24/2012 07:58 PM, Luigi Rizzo wrote:
> > I noticed that the various NIC modules in qemu/kvm do not implement
> > interrupt mitigation, which is very beneficial as it dramatically
> > reduces exits from the hypervisor.
> > 
> > As a proof of concept i tried to implement it for the e1000 driver
> > (patch below), and it brings tx performance from 9 to 56Kpps on
> > qemu-softmmu, and from ~20 to 140Kpps on qemu-kvm.
> > 
> > I am going to measure the rx interrupt mitigation in the next couple
> > of days.
> > 
> > Is there any interest in having this code in ?
> 
> Indeed.  But please drop the #ifdef MITIGATIONs.

Thanks for the comments. The #ifdef block MITIGATION was only temporary to
point out the differences and run the performance comparisons.
Similarly, the magic thresholds below will be replaced with
appropriately commented #defines.

Note:
On the real hardware interrupt mitigation is controlled by a total of four
registers (TIDV, TADV, RIDV, RADV) which control it with a granularity
of 1024 ns, see

http://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf

An exact emulation of the feature is hard, because the timer resolution we
have is much coarser (in the ms range). So i am inclined to use a different
approach, similar to the one i have implemented, namely:
- the first few packets (whether 1 or 4 or 5 will be decided on the host)
  report an interrupt immediately;
- subsequent interrupts are delayed through qemu_bh_schedule_idle()
  (which is unpredictable but efficient; i tried qemu_bh_schedule()
  but it completely defeats mitigation)
- when the TX or RX rings are close to getting full, then again
  an interrupt is delivered immediately.

This approach also has the advantage of not requiring specific support
in the OS drivers.

cheers
luigi

> > +
> > +#ifdef MITIGATION
> > +    QEMUBH *int_bh;	// interrupt mitigation handler
> > +    int tx_ics_count;	// pending tx int requests
> > +    int rx_ics_count;	// pending rx int requests
> > +    int int_cause;	// int cause
> 
> Use uint32_t for int_cause, also a correctly sized type for the packet
> counts.
> 
> >  
> > +#ifdef MITIGATION
> > +    /* we transmit the first few packets, or we do if we are
> > +     * approaching a full ring. in the latter case, also
> > +     * send an ics.
> > +     * 
> > +     */
> > +{
> > +    int len, pending;
> > +    len = s->mac_reg[TDLEN] / sizeof(desc) ;
> > +    pending = s->mac_reg[TDT] - s->mac_reg[TDH];
> > +    if (pending < 0)
> > +	pending += len;
> > +    /* ignore requests after the first few ones, as long as
> > +     * we are not approaching a full ring.
> > +     * Otherwise, deliver packets to the backend.
> > +     */
> > +    if (s->tx_ics_count > 4 && s->tx_ics_count + pending < len - 5)
> > +	return;
> 
> Where do the 4 and 5 come from?  Shouldn't they be controlled by the
> guest using a device register?
> 
> >      }
> > +#ifdef MITIGATION
> > +    s->int_cause |= cause; // remember the interrupt cause.
> > +    s->tx_ics_count += pending;
> > +    if (s->tx_ics_count >= len - 5) {
> > +        // if the ring is about to become full, generate an interrupt
> 
> Another magic number.
> 
> > +	set_ics(s, 0, s->int_cause);
> > +	s->tx_ics_count = 0;
> > +	s->int_cause = 0;
> > +    } else {	// otherwise just schedule it for later.
> > +        qemu_bh_schedule_idle(s->int_bh);
> > +    }
> > +}
> > +#else /* !MITIGATION */
> >      set_ics(s, 0, cause);
> > +#endif
> >  }
> >  
> 
> -- 
> error compiling committee.c: too many arguments to function
> 
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] interrupt mitigation for e1000
  2012-07-25  9:56   ` Luigi Rizzo
@ 2012-07-25 10:00     ` Avi Kivity
  2012-07-25 10:12     ` Paolo Bonzini
  1 sibling, 0 replies; 7+ messages in thread
From: Avi Kivity @ 2012-07-25 10:00 UTC (permalink / raw)
  To: Luigi Rizzo; +Cc: qemu-devel

On 07/25/2012 12:56 PM, Luigi Rizzo wrote:

>> Indeed.  But please drop the #ifdef MITIGATIONs.
> 
> Thanks for the comments. The #ifdef block MITIGATION was only temporary to
> point out the differences and run the performance comparisons.

Ok.  In a patch, the '+' in front of a line serves that, and I usually
just check out the previous version to run a performance comparison.

> Similarly, the magic thresholds below will be replaced with
> appropriately commented #defines.
> 
> Note:
> On the real hardware interrupt mitigation is controlled by a total of four
> registers (TIDV, TADV, RIDV, RADV) which control it with a granularity
> of 1024ns , see
> 
> http://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> 
> An exact emulation of the feature is hard, because the timer resolution we
> have is much coarser (in the ms range).

No, timers have ns precision in Linux.

> So i am inclined to use a different
> approach, similar to the one i have implemented, namely:
> - the first few packets (whether 1 or 4 or 5 will be decided on the host)
>   report an interrupt immediately;
> - subsequent interrupts are delayed through qemu_bh_schedule_idle()
>   (which is unpredictable but efficient; i tried qemu_bh_schedule()
>   but it completely defeats mitigation)
> - when the TX or RX rings are close to getting full, then again
>   an interrupt is delivered immediately.
> 
> This approach also has the advantage of not requiring specific support
> in the OS drivers.
>

But the disadvantage, that if a guest explicitly chooses not to use
interrupt mitigation, in order to reduce latency, then that choice is
ignored.

We should follow the hardware as closely as possible (but no closer).

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] interrupt mitigation for e1000
  2012-07-25  9:56   ` Luigi Rizzo
  2012-07-25 10:00     ` Avi Kivity
@ 2012-07-25 10:12     ` Paolo Bonzini
  2012-07-25 10:54       ` Luigi Rizzo
  1 sibling, 1 reply; 7+ messages in thread
From: Paolo Bonzini @ 2012-07-25 10:12 UTC (permalink / raw)
  To: Luigi Rizzo; +Cc: Avi Kivity, qemu-devel

Il 25/07/2012 11:56, Luigi Rizzo ha scritto:
> On Wed, Jul 25, 2012 at 11:53:29AM +0300, Avi Kivity wrote:
>> On 07/24/2012 07:58 PM, Luigi Rizzo wrote:
>>> I noticed that the various NIC modules in qemu/kvm do not implement
>>> interrupt mitigation, which is very beneficial as it dramatically
>>> reduces exits from the hypervisor.
>>>
>>> As a proof of concept i tried to implement it for the e1000 driver
>>> (patch below), and it brings tx performance from 9 to 56Kpps on
>>> qemu-softmmu, and from ~20 to 140Kpps on qemu-kvm.
>>>
>>> I am going to measure the rx interrupt mitigation in the next couple
>>> of days.
>>>
>>> Is there any interest in having this code in ?
>>
>> Indeed.  But please drop the #ifdef MITIGATIONs.
> 
> Thanks for the comments. The #ifdef block MITIGATION was only temporary to
> point out the differences and run the performance comparisons.
> Similarly, the magic thresholds below will be replaced with
> appropriately commented #defines.
> 
> Note:
> On the real hardware interrupt mitigation is controlled by a total of four
> registers (TIDV, TADV, RIDV, RADV) which control it with a granularity
> of 1024ns , see
> 
> http://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> 
> An exact emulation of the feature is hard, because the timer resolution we
> have is much coarser (in the ms range). So i am inclined to use a different
> approach, similar to the one i have implemented, namely:
> - the first few packets (whether 1 or 4 or 5 will be decided on the host)
>   report an interrupt immediately;
> - subsequent interrupts are delayed through qemu_bh_schedule_idle()

qemu_bh_schedule_idle() is really a 10ms timer.

Paolo

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] interrupt mitigation for e1000
  2012-07-25 10:12     ` Paolo Bonzini
@ 2012-07-25 10:54       ` Luigi Rizzo
  0 siblings, 0 replies; 7+ messages in thread
From: Luigi Rizzo @ 2012-07-25 10:54 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Avi Kivity, qemu-devel

On Wed, Jul 25, 2012 at 12:12:55PM +0200, Paolo Bonzini wrote:
> Il 25/07/2012 11:56, Luigi Rizzo ha scritto:
> > On Wed, Jul 25, 2012 at 11:53:29AM +0300, Avi Kivity wrote:
> >> On 07/24/2012 07:58 PM, Luigi Rizzo wrote:
> >>> I noticed that the various NIC modules in qemu/kvm do not implement
> >>> interrupt mitigation, which is very beneficial as it dramatically
> >>> reduces exits from the hypervisor.
> >>>
> >>> As a proof of concept i tried to implement it for the e1000 driver
> >>> (patch below), and it brings tx performance from 9 to 56Kpps on
> >>> qemu-softmmu, and from ~20 to 140Kpps on qemu-kvm.
> >>>
> >>> I am going to measure the rx interrupt mitigation in the next couple
> >>> of days.
> >>>
> >>> Is there any interest in having this code in ?
> >>
> >> Indeed.  But please drop the #ifdef MITIGATIONs.
> > 
> > Thanks for the comments. The #ifdef block MITIGATION was only temporary to
> > point out the differences and run the performance comparisons.
> > Similarly, the magic thresholds below will be replaced with
> > appropriately commented #defines.
> > 
> > Note:
> > On the real hardware interrupt mitigation is controlled by a total of four
> > registers (TIDV, TADV, RIDV, RADV) which control it with a granularity
> > of 1024ns , see
> > 
> > http://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> > 
> > An exact emulation of the feature is hard, because the timer resolution we
> > have is much coarser (in the ms range). So i am inclined to use a different
> > approach, similar to the one i have implemented, namely:
> > - the first few packets (whether 1 or 4 or 5 will be decided on the host)
> >   report an interrupt immediately;
> > - subsequent interrupts are delayed through qemu_bh_schedule_idle()
> 
> qemu_bh_schedule_idle() is really a 10ms timer.

yes, i figured that out, this is why i said that my code was more
a "proof of concept" than an actual patch.

If you have a suggestion on how to schedule a shorter (say 1ms) timer i
am all ears. Perhaps qemu_new_timer_ns() and friends ?
 
This said, i do not plan to implement the full mitigation registers
controlled by the guest, just possibly use a parameter as in virtio-net
where you can have
'tx=bh' or 'tx=timer' and 'x-txtimer=N' with N is the mitigation delay
in nanoseconds (virtually, in practice rounded to whatever the host granularity is)

cheers
luigi

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2012-07-25 10:34 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-24 16:58 [Qemu-devel] interrupt mitigation for e1000 Luigi Rizzo
2012-07-25  7:47 ` Stefan Hajnoczi
2012-07-25  8:53 ` Avi Kivity
2012-07-25  9:56   ` Luigi Rizzo
2012-07-25 10:00     ` Avi Kivity
2012-07-25 10:12     ` Paolo Bonzini
2012-07-25 10:54       ` Luigi Rizzo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.