All of lore.kernel.org
 help / color / mirror / Atom feed
* QEMU PIC indirection patch for in-kernel APIC work
@ 2007-04-02 11:46 Gregory Haskins
       [not found] ` <4610A6A9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Gregory Haskins @ 2007-04-02 11:46 UTC (permalink / raw)
  To: Dor Laor; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

[-- Attachment #1: Type: text/plain, Size: 1381 bytes --]


Hi Dor,
  Please find a patch attached for your review which adds support for dynamic substitution of the PIC/APIC code to QEMU. This will allow us to selectively chose the KVM in-kernel apic emulation vs the QEMU user-space apic emulation.  Support for both is key to allow "--no-kvm" type operation to continue working even after the in-kernel code is deployed.

Note that this is only the part the allows indirection.  The code that actually fills in the "slim apic" in QEMU, as well as the kernel side changes applied to git are not included here.  Note also that this patch can stand alone.  I have confirmed that I can boot a guest with no discernible difference in behavior/performance both before and after this patch. YMMV as my test cases are limited.

Note that this patch only touches the KVM specific portions of QEMU (namely, x86/i8259/apic support).  If we decide that this should be pushed to QEMU upstream, there is still work to be done to convert the other PIC implementations (e.g. arm_pic, etc) to use the new interface.

I will continue to work on the kernel/slim-apic side while I await your feedback.  (BTW: Thanks for pointing me towards your branch...it has been most helpful rather than starting from scratch.  I already have most of the kernel-side code ported already in a separate queue which I will share once I get a baseline).  

Thanks!

-Greg 


[-- Attachment #2: qemu_pic_indirection.patch --]
[-- Type: text/plain, Size: 22102 bytes --]

Index: kvm/qemu/vl.h
===================================================================
--- kvm.orig/qemu/vl.h
+++ kvm/qemu/vl.h
@@ -1040,16 +1040,11 @@ ParallelState *parallel_init(int base, i
 
 /* i8259.c */
 
-typedef struct PicState2 PicState2;
-extern PicState2 *isa_pic;
-void pic_set_irq(int irq, int level);
-void pic_set_irq_new(void *opaque, int irq, int level);
-PicState2 *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque);
-void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc *alt_irq_func,
-                          void *alt_irq_opaque);
-int pic_read_irq(PicState2 *s);
-void pic_update_irq(PicState2 *s);
-uint32_t pic_intack_read(PicState2 *s);
+#include "pic.h"
+
+PIC *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque);
+void pic_set_alt_irq_func(PIC *pic, SetIRQFunc *alt_irq_func,
+			  void *alt_irq_opaque);
 void pic_info(void);
 void irq_info(void);
 
@@ -1057,7 +1052,6 @@ void irq_info(void);
 typedef struct IOAPICState IOAPICState;
 
 int apic_init(CPUState *env);
-int apic_get_interrupt(CPUState *env);
 IOAPICState *ioapic_init(void);
 void ioapic_set_irq(void *opaque, int vector, int level);
 
Index: kvm/qemu/hw/i8259.c
===================================================================
--- kvm.orig/qemu/hw/i8259.c
+++ kvm/qemu/hw/i8259.c
@@ -29,6 +29,8 @@
 //#define DEBUG_IRQ_LATENCY
 //#define DEBUG_IRQ_COUNT
 
+typedef struct PicState2 PicState2;
+
 typedef struct PicState {
     uint8_t last_irr; /* edge detection */
     uint8_t irr; /* interrupt request register */
@@ -58,6 +60,7 @@ struct PicState2 {
     /* IOAPIC callback support */
     SetIRQFunc *alt_irq_func;
     void *alt_irq_opaque;
+    PIC *base;
 };
 
 #if defined(DEBUG_PIC) || defined (DEBUG_IRQ_COUNT)
@@ -133,8 +136,9 @@ static int pic_get_irq(PicState *s)
 /* raise irq to CPU if necessary. must be called every time the active
    irq may change */
 /* XXX: should not export it, but it is needed for an APIC kludge */
-void pic_update_irq(PicState2 *s)
+static void i8259_update_irq(PIC *pic)
 {
+    PicState2 *s = (PicState2*)pic->private;
     int irq2, irq;
 
     /* first look at slave pic */
@@ -174,9 +178,9 @@ void pic_update_irq(PicState2 *s)
 int64_t irq_time[16];
 #endif
 
-void pic_set_irq_new(void *opaque, int irq, int level)
+static void i8259_set_irq(PIC *pic, int irq, int level)
 {
-    PicState2 *s = opaque;
+    PicState2 *s = (PicState2*)pic->private;
 
 #if defined(DEBUG_PIC) || defined(DEBUG_IRQ_COUNT)
     if (level != irq_level[irq]) {
@@ -199,13 +203,7 @@ void pic_set_irq_new(void *opaque, int i
     /* used for IOAPIC irqs */
     if (s->alt_irq_func)
         s->alt_irq_func(s->alt_irq_opaque, irq, level);
-    pic_update_irq(s);
-}
-
-/* obsolete function */
-void pic_set_irq(int irq, int level)
-{
-    pic_set_irq_new(isa_pic, irq, level);
+    pic_update_irq(pic);
 }
 
 /* acknowledge interrupt 'irq' */
@@ -231,8 +229,9 @@ static inline void pic_intack(PicState *
     }
 }
 
-int pic_read_irq(PicState2 *s)
+static int i8259_read_irq(PIC *pic)
 {
+    PicState2 *s = (PicState2*)pic->private;
     int irq, irq2, intno;
 
     irq = pic_get_irq(&s->pics[0]);
@@ -256,7 +255,7 @@ int pic_read_irq(PicState2 *s)
         irq = 7;
         intno = s->pics[0].irq_base + irq;
     }
-    pic_update_irq(s);
+    pic_update_irq(pic);
         
 #ifdef DEBUG_IRQ_LATENCY
     printf("IRQ%d latency=%0.3fus\n", 
@@ -333,23 +332,23 @@ static void pic_ioport_write(void *opaqu
                     s->isr &= ~(1 << irq);
                     if (cmd == 5)
                         s->priority_add = (irq + 1) & 7;
-                    pic_update_irq(s->pics_state);
+                    pic_update_irq(s->pics_state->base);
                 }
                 break;
             case 3:
                 irq = val & 7;
                 s->isr &= ~(1 << irq);
-                pic_update_irq(s->pics_state);
+                pic_update_irq(s->pics_state->base);
                 break;
             case 6:
                 s->priority_add = (val + 1) & 7;
-                pic_update_irq(s->pics_state);
+                pic_update_irq(s->pics_state->base);
                 break;
             case 7:
                 irq = val & 7;
                 s->isr &= ~(1 << irq);
                 s->priority_add = (irq + 1) & 7;
-                pic_update_irq(s->pics_state);
+                pic_update_irq(s->pics_state->base);
                 break;
             default:
                 /* no operation */
@@ -361,7 +360,7 @@ static void pic_ioport_write(void *opaqu
         case 0:
             /* normal mode */
             s->imr = val;
-            pic_update_irq(s->pics_state);
+            pic_update_irq(s->pics_state->base);
             break;
         case 1:
             s->irq_base = val & 0xf8;
@@ -396,10 +395,10 @@ static uint32_t pic_poll_read (PicState 
         s->irr &= ~(1 << ret);
         s->isr &= ~(1 << ret);
         if (addr1 >> 7 || ret != 2)
-            pic_update_irq(s->pics_state);
+            pic_update_irq(s->pics_state->base);
     } else {
         ret = 0x07;
-        pic_update_irq(s->pics_state);
+        pic_update_irq(s->pics_state->base);
     }
 
     return ret;
@@ -434,8 +433,9 @@ static uint32_t pic_ioport_read(void *op
 
 /* memory mapped interrupt status */
 /* XXX: may be the same than pic_read_irq() */
-uint32_t pic_intack_read(PicState2 *s)
+static uint32_t i8259_intack_read(PIC *pic)
 {
+    PicState2 *s = (PicState2*)pic->private;
     int ret;
 
     ret = pic_poll_read(&s->pics[0], 0x00);
@@ -518,16 +518,14 @@ static void pic_init1(int io_addr, int e
     qemu_register_reset(pic_reset, s);
 }
 
-void pic_info(void)
+static void i8259_info(PIC *pic)
 {
     int i;
+    PicState2 *s2 = (PicState2*)pic->private;
     PicState *s;
     
-    if (!isa_pic)
-        return;
-
     for(i=0;i<2;i++) {
-        s = &isa_pic->pics[i];
+        s = &s2->pics[i];
         term_printf("pic%d: irr=%02x imr=%02x isr=%02x hprio=%d irq_base=%02x rr_sel=%d elcr=%02x fnm=%d\n",
                     i, s->irr, s->imr, s->isr, s->priority_add, 
                     s->irq_base, s->read_reg_select, s->elcr, 
@@ -535,7 +533,7 @@ void pic_info(void)
     }
 }
 
-void irq_info(void)
+static void i8259_stat(PIC *pic)
 {
 #ifndef DEBUG_IRQ_COUNT
     term_printf("irq statistic code not compiled.\n");
@@ -552,12 +550,38 @@ void irq_info(void)
 #endif
 }
 
-PicState2 *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque)
+void pic_info(void)
 {
-    PicState2 *s;
-    s = qemu_mallocz(sizeof(PicState2));
-    if (!s)
+    isa_pic->info(isa_pic);
+}
+
+void irq_info(void)
+{
+    isa_pic->stat(isa_pic);
+}
+
+PIC *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque)
+{
+    PIC *pic = qemu_mallocz(sizeof(PIC));
+    if (!pic)
+	return NULL;
+
+    pic->set         = i8259_set_irq;
+    pic->read        = i8259_read_irq;
+    pic->intack_read = i8259_intack_read;
+    pic->update      = i8259_update_irq;
+    pic->info        = i8259_info;
+    pic->stat        = i8259_stat;
+
+    pic->private = qemu_mallocz(sizeof(PicState2));
+    if (!pic->private) {
+	qemu_free(pic);
         return NULL;
+    }
+
+    PicState2 *s = (PicState2*)pic->private;
+
+    s->base = pic;
     pic_init1(0x20, 0x4d0, &s->pics[0]);
     pic_init1(0xa0, 0x4d1, &s->pics[1]);
     s->pics[0].elcr_mask = 0xf8;
@@ -566,12 +590,14 @@ PicState2 *pic_init(IRQRequestFunc *irq_
     s->irq_request_opaque = irq_request_opaque;
     s->pics[0].pics_state = s;
     s->pics[1].pics_state = s;
-    return s;
+    return pic;
 }
 
-void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc *alt_irq_func,
+void pic_set_alt_irq_func(PIC *pic, SetIRQFunc *alt_irq_func,
                           void *alt_irq_opaque)
 {
+    PicState2 *s = (PicState2*)pic->private;
+
     s->alt_irq_func = alt_irq_func;
     s->alt_irq_opaque = alt_irq_opaque;
 }
Index: kvm/qemu/hw/ide.c
===================================================================
--- kvm.orig/qemu/hw/ide.c
+++ kvm/qemu/hw/ide.c
@@ -2190,7 +2190,7 @@ void isa_ide_init(int iobase, int iobase
     if (!ide_state)
         return;
     
-    ide_init2(ide_state, hd0, hd1, pic_set_irq_new, isa_pic, irq);
+    ide_init2(ide_state, hd0, hd1, (SetIRQFunc *)isa_pic->set, isa_pic, irq);
     ide_init_ioport(ide_state, iobase, iobase2);
 }
 
@@ -2617,9 +2617,9 @@ void pci_piix_ide_init(PCIBus *bus, Bloc
                            PCI_ADDRESS_SPACE_IO, bmdma_map);
 
     ide_init2(&d->ide_if[0], hd_table[0], hd_table[1],
-              pic_set_irq_new, isa_pic, 14);
+              (SetIRQFunc *)isa_pic->set, isa_pic, 14);
     ide_init2(&d->ide_if[2], hd_table[2], hd_table[3],
-              pic_set_irq_new, isa_pic, 15);
+              (SetIRQFunc *)isa_pic->set, isa_pic, 15);
     ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
     ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
 
@@ -2656,9 +2656,9 @@ void pci_piix3_ide_init(PCIBus *bus, Blo
                            PCI_ADDRESS_SPACE_IO, bmdma_map);
 
     ide_init2(&d->ide_if[0], hd_table[0], hd_table[1],
-              pic_set_irq_new, isa_pic, 14);
+             (SetIRQFunc *)isa_pic->set, isa_pic, 14);
     ide_init2(&d->ide_if[2], hd_table[2], hd_table[3],
-              pic_set_irq_new, isa_pic, 15);
+              (SetIRQFunc *)isa_pic->set, isa_pic, 15);
     ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
     ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
 
Index: kvm/qemu/hw/pc.c
===================================================================
--- kvm.orig/qemu/hw/pc.c
+++ kvm/qemu/hw/pc.c
@@ -89,18 +89,7 @@ void cpu_smm_update(CPUState *env)
 /* IRQ handling */
 int cpu_get_pic_interrupt(CPUState *env)
 {
-    int intno;
-
-    intno = apic_get_interrupt(env);
-    if (intno >= 0) {
-        /* set irq request if a PIC irq is still pending */
-        /* XXX: improve that */
-        pic_update_irq(isa_pic); 
-        return intno;
-    }
-    /* read the irq from the PIC */
-    intno = pic_read_irq(isa_pic);
-    return intno;
+    return apic_read_irq(env);
 }
 
 static void pic_irq_request(void *opaque, int level)
@@ -679,7 +668,7 @@ static void pc_init1(int ram_size, int v
 
     for(i = 0; i < MAX_SERIAL_PORTS; i++) {
         if (serial_hds[i]) {
-            serial_init(&pic_set_irq_new, isa_pic,
+            serial_init((SetIRQFunc *)isa_pic->set, isa_pic,
                         serial_io[i], serial_irq[i], serial_hds[i]);
         }
     }
Index: kvm/qemu/vl.c
===================================================================
--- kvm.orig/qemu/vl.c
+++ kvm/qemu/vl.c
@@ -186,7 +186,7 @@ int time_drift_fix = 1;
 /* x86 ISA bus support */
 
 target_phys_addr_t isa_mem_base = 0;
-PicState2 *isa_pic;
+PIC *isa_pic;
 
 uint32_t default_ioport_readb(void *opaque, uint32_t address)
 {
Index: kvm/qemu/hw/apic.c
===================================================================
--- kvm.orig/qemu/hw/apic.c
+++ kvm/qemu/hw/apic.c
@@ -84,6 +84,7 @@ typedef struct APICState {
     uint32_t initial_count;
     int64_t initial_count_load_time, next_time;
     QEMUTimer *timer;
+    CPUState *env;
 } APICState;
 
 struct IOAPICState {
@@ -235,9 +236,9 @@ static void apic_bus_deliver(const uint3
                  apic_set_irq(apic_iter, vector_num, trigger_mode) );
 }
 
-void cpu_set_apic_base(CPUState *env, uint64_t val)
+static void _apic_set_base(APIC *apic, uint64_t val)
 {
-    APICState *s = env->apic_state;
+    APICState *s = (APICState*)apic->private;
 #ifdef DEBUG_APIC
     printf("cpu_set_apic_base: %016" PRIx64 "\n", val);
 #endif
@@ -246,30 +247,30 @@ void cpu_set_apic_base(CPUState *env, ui
     /* if disabled, cannot be enabled again */
     if (!(val & MSR_IA32_APICBASE_ENABLE)) {
         s->apicbase &= ~MSR_IA32_APICBASE_ENABLE;
-        env->cpuid_features &= ~CPUID_APIC;
+        s->env->cpuid_features &= ~CPUID_APIC;
         s->spurious_vec &= ~APIC_SV_ENABLE;
     }
 }
 
-uint64_t cpu_get_apic_base(CPUState *env)
+static uint64_t _apic_get_base(APIC *apic)
 {
-    APICState *s = env->apic_state;
+    APICState *s = (APICState*)apic->private;
 #ifdef DEBUG_APIC
     printf("cpu_get_apic_base: %016" PRIx64 "\n", (uint64_t)s->apicbase);
 #endif
     return s->apicbase;
 }
 
-void cpu_set_apic_tpr(CPUX86State *env, uint8_t val)
+static void _apic_set_tpr(APIC *apic, uint8_t val)
 {
-    APICState *s = env->apic_state;
+    APICState *s = (APICState*)apic->private;
     s->tpr = (val & 0x0f) << 4;
     apic_update_irq(s);
 }
 
-uint8_t cpu_get_apic_tpr(CPUX86State *env)
+static uint8_t _apic_get_tpr(APIC *apic)
 {
-    APICState *s = env->apic_state;
+    APICState *s = (APICState*)apic->private;
     return s->tpr >> 4;
 }
 
@@ -460,28 +461,34 @@ static void apic_deliver(APICState *s, u
                      trigger_mode);
 }
 
-int apic_get_interrupt(CPUState *env)
+static int _apic_read_irq(APIC *apic)
 {
-    APICState *s = env->apic_state;
+    APICState *s = (APICState*)apic->private;
     int intno;
 
     /* if the APIC is installed or enabled, we let the 8259 handle the
        IRQs */
-    if (!s)
-        return -1;
-    if (!(s->spurious_vec & APIC_SV_ENABLE))
-        return -1;
+    if (!s || !(s->spurious_vec & APIC_SV_ENABLE))
+	goto use_pic;
     
     /* XXX: spurious IRQ handling */
     intno = get_highest_priority_int(s->irr);
     if (intno < 0)
-        return -1;
+	goto use_pic;
     if (s->tpr && intno <= s->tpr)
         return s->spurious_vec & 0xff;
     reset_bit(s->irr, intno);
     set_bit(s->isr, intno);
     apic_update_irq(s);
+    
+    /* set irq request if a PIC irq is still pending */
+    /* XXX: improve that */
+    pic_update_irq(isa_pic); 
+
     return intno;
+
+ use_pic:
+    return pic_read_irq(isa_pic);
 }
 
 static uint32_t apic_get_current_count(APICState *s)
@@ -563,7 +570,7 @@ static uint32_t apic_mem_readl(void *opa
     env = cpu_single_env;
     if (!env)
         return 0;
-    s = env->apic_state;
+    s = (APICState*)((APIC*)(env->apic))->private;
 
     index = (addr >> 4) & 0xff;
     switch(index) {
@@ -640,7 +647,7 @@ static void apic_mem_writel(void *opaque
     env = cpu_single_env;
     if (!env)
         return;
-    s = env->apic_state;
+    s = (APICState*)((APIC*)(env->apic))->private;
 
 #ifdef DEBUG_APIC
     printf("APIC write: %08x = %08x\n", (uint32_t)addr, val);
@@ -806,14 +813,31 @@ static CPUWriteMemoryFunc *apic_mem_writ
 
 int apic_init(CPUState *env)
 {
+    APIC      *apic;
     APICState *s;
 
     if (last_apic_id >= MAX_APICS)
         return -1;
+ 
+    apic = qemu_mallocz(sizeof(APIC));
+    if(!apic)
+	return -1;
+
     s = qemu_mallocz(sizeof(APICState));
-    if (!s)
+    if (!s) {
+	qemu_free(apic);
         return -1;
-    env->apic_state = s;
+    }
+    env->apic = apic;
+    s->env = env;
+
+    apic->read_irq = _apic_read_irq;
+    apic->set_base = _apic_set_base;
+    apic->get_base = _apic_get_base;
+    apic->set_tpr  = _apic_set_tpr;
+    apic->get_tpr  = _apic_get_tpr;
+
+    apic->private = s;
     apic_init_ipi(s);
     s->id = last_apic_id++;
     s->cpu_env = env;
Index: kvm/qemu/target-i386/cpu.h
===================================================================
--- kvm.orig/qemu/target-i386/cpu.h
+++ kvm/qemu/target-i386/cpu.h
@@ -553,7 +553,7 @@ typedef struct CPUX86State {
 
     /* in order to simplify APIC support, we leave this pointer to the
        user */
-    struct APICState *apic_state;
+    void *apic;
 } CPUX86State;
 
 CPUX86State *cpu_x86_init(void);
@@ -651,12 +651,6 @@ void cpu_x86_set_a20(CPUX86State *env, i
 
 uint64_t cpu_get_tsc(CPUX86State *env);
 
-void cpu_set_apic_base(CPUX86State *env, uint64_t val);
-uint64_t cpu_get_apic_base(CPUX86State *env);
-void cpu_set_apic_tpr(CPUX86State *env, uint8_t val);
-#ifndef NO_CPU_IO_DEFS
-uint8_t cpu_get_apic_tpr(CPUX86State *env);
-#endif
 void cpu_smm_update(CPUX86State *env);
 
 /* will be suppressed */
Index: kvm/qemu/qemu-kvm.c
===================================================================
--- kvm.orig/qemu/qemu-kvm.c
+++ kvm/qemu/qemu-kvm.c
@@ -7,6 +7,7 @@
 #include "exec.h"
 
 #include "qemu-kvm.h"
+#include "pic.h"
 #include <kvmctl.h>
 #include <string.h>
 
@@ -215,9 +216,9 @@ static void load_regs(CPUState *env)
     sregs.cr3 = env->cr[3];
     sregs.cr4 = env->cr[4];
 
-    sregs.apic_base = cpu_get_apic_base(env);
+    sregs.apic_base = apic_get_base(env);
     sregs.efer = env->efer;
-    sregs.cr8 = cpu_get_apic_tpr(env);
+    sregs.cr8 = apic_get_tpr(env);
 
     kvm_set_sregs(kvm_context, 0, &sregs);
 
@@ -298,7 +299,7 @@ static void save_regs(CPUState *env)
     env->cr[3] = sregs.cr3;
     env->cr[4] = sregs.cr4;
 
-    cpu_set_apic_base(env, sregs.apic_base);
+    apic_set_base(env, sregs.apic_base);
 
     env->efer = sregs.efer;
     //cpu_set_apic_tpr(env, sregs.cr8);
@@ -403,7 +404,7 @@ static void post_kvm_run(void *opaque, s
     env->eflags = (kvm_run->if_flag) ? env->eflags | IF_MASK:env->eflags & ~IF_MASK;
     env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection;
     //cpu_set_apic_tpr(env, kvm_run->cr8);
-    cpu_set_apic_base(env, kvm_run->apic_base);
+    apic_set_base(env, kvm_run->apic_base);
 }
 
 static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
@@ -411,7 +412,7 @@ static void pre_kvm_run(void *opaque, st
     CPUState **envs = opaque, *env;
     env = envs[0];
 
-    kvm_run->cr8 = cpu_get_apic_tpr(env);
+    kvm_run->cr8 = apic_get_tpr(env);
 }
 
 void kvm_load_registers(CPUState *env)
Index: kvm/qemu/target-i386/helper.c
===================================================================
--- kvm.orig/qemu/target-i386/helper.c
+++ kvm/qemu/target-i386/helper.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include "exec.h"
+#include "pic.h"
 #ifdef USE_KVM
 extern int kvm_allowed;
 #endif
@@ -2650,7 +2651,7 @@ void helper_movl_crN_T0(int reg)
         cpu_x86_update_cr4(env, T0);
         break;
     case 8:
-        cpu_set_apic_tpr(env, T0);
+        apic_set_tpr(env, T0);
         break;
     default:
         env->cr[reg] = T0;
@@ -2708,7 +2709,7 @@ void helper_wrmsr(void)
         env->sysenter_eip = val;
         break;
     case MSR_IA32_APICBASE:
-        cpu_set_apic_base(env, val);
+        apic_set_base(env, val);
         break;
     case MSR_EFER:
         {
@@ -2772,7 +2773,7 @@ void helper_rdmsr(void)
         val = env->sysenter_eip;
         break;
     case MSR_IA32_APICBASE:
-        val = cpu_get_apic_base(env);
+        val = apic_get_base(env);
         break;
     case MSR_EFER:
         val = env->efer;
Index: kvm/qemu/target-i386/op.c
===================================================================
--- kvm.orig/qemu/target-i386/op.c
+++ kvm/qemu/target-i386/op.c
@@ -20,6 +20,7 @@
 
 #define ASM_SOFTMMU
 #include "exec.h"
+#include "pic.h"
 
 /* n must be a constant to be efficient */
 static inline target_long lshift(target_long x, int n)
@@ -1246,7 +1247,7 @@ void OPPROTO op_movl_crN_T0(void)
 #if !defined(CONFIG_USER_ONLY) 
 void OPPROTO op_movtl_T0_cr8(void)
 {
-    T0 = cpu_get_apic_tpr(env);
+    T0 = apic_get_tpr(env);
 }
 #endif
 
Index: kvm/qemu/pic.h
===================================================================
--- /dev/null
+++ kvm/qemu/pic.h
@@ -0,0 +1,52 @@
+/************************************************************************
+ * pic.h: Provides an indirection layer for the (A)PIC which allows 
+ *        dynamic substituion of the interrupt handling code.  This
+ *        will allow us to selectively chose the KVM in-kernel apic 
+ *        emulation vs the QEMU user-space apic emulation.  Support
+ *        for both is key to allow "--no-kvm" type operation to continue
+ *        working even after the in-kernel code is deployed.
+ *
+ *        Written by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ ***********************************************************************/
+
+#ifndef QEMU_PIC_H
+#define QEMU_PIC_H
+
+typedef struct PIC_tag
+{
+    void     (*set)(struct PIC_tag *this, int irq, int level);
+    int      (*read)(struct PIC_tag *this);
+    uint32_t (*intack_read)(struct PIC_tag *this);
+    void     (*update)(struct PIC_tag *this);
+    void     (*info)(struct PIC_tag *this);
+    void     (*stat)(struct PIC_tag *this);
+    
+    void *private;
+}PIC;
+
+extern PIC *isa_pic;
+#define pic_set_irq(irq, level)          isa_pic->set(isa_pic, irq, level)
+#define pic_set_irq_new(pic, irq, level) pic->set(pic, irq, level)
+#define pic_read_irq(pic)                pic->read(pic)
+#define pic_intack_read(pic)             pic->intack_read(pic)
+#define pic_update_irq(pic)              pic->update(pic)
+
+typedef struct APIC_tag
+{
+    int      (*read_irq)(void *this);
+    void     (*set_base)(void *this, uint64_t val);
+    uint64_t (*get_base)(void *this);
+    void     (*set_tpr)(void *this, uint8_t val);
+    uint8_t  (*get_tpr)(void *this);
+
+    void *private;
+}APIC;
+
+#define apic_read_irq(env)      ((APIC*)(env->apic))->read_irq(env->apic)
+#define apic_set_base(env, val) ((APIC*)(env->apic))->set_base(env->apic, val)
+#define apic_get_base(env)      ((APIC*)(env->apic))->get_base(env->apic)
+#define apic_set_tpr(env, val)  ((APIC*)(env->apic))->set_tpr(env->apic, val)
+#define apic_get_tpr(env)       ((APIC*)(env->apic))->get_tpr(env->apic)
+
+#endif /* QEMU_PIC_H */

[-- Attachment #3: Type: text/plain, Size: 345 bytes --]

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

[-- Attachment #4: Type: text/plain, Size: 186 bytes --]

_______________________________________________
kvm-devel mailing list
kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
https://lists.sourceforge.net/lists/listinfo/kvm-devel

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found] ` <4610A6A9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
@ 2007-04-04  6:53   ` Avi Kivity
       [not found]     ` <46134B74.1080004-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04  6:53 UTC (permalink / raw)
  To: Gregory Haskins, Anthony Liguori
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Gregory Haskins wrote:
> Hi Dor,
>   Please find a patch attached for your review which adds support for dynamic substitution of the PIC/APIC code to QEMU. This will allow us to selectively chose the KVM in-kernel apic emulation vs the QEMU user-space apic emulation.  Support for both is key to allow "--no-kvm" type operation to continue working even after the in-kernel code is deployed.
>
> Note that this is only the part the allows indirection.  The code that actually fills in the "slim apic" in QEMU, as well as the kernel side changes applied to git are not included here.  Note also that this patch can stand alone.  I have confirmed that I can boot a guest with no discernible difference in behavior/performance both before and after this patch. YMMV as my test cases are limited.
>
> Note that this patch only touches the KVM specific portions of QEMU (namely, x86/i8259/apic support).  If we decide that this should be pushed to QEMU upstream, there is still work to be done to convert the other PIC implementations (e.g. arm_pic, etc) to use the new interface.
>   

While the approach is technically better than #ifdefing things away, 
this patch would really cause us to drift too far from the qemu 
codebase, making the eventual merge back (and any intervening merges to 
kvm) much more difficult.  We also have no guarantee that upstream qemu 
will accept this change.

If we could get this accepted into upstream qemu, and then merge 
qemu-devel, that would resolve these issues.

Anthony, you're our qemu expert.  What's your opinion?



> Thanks!
>
> -Greg 
>
>   
> ------------------------------------------------------------------------
>
> Index: kvm/qemu/vl.h
> ===================================================================
> --- kvm.orig/qemu/vl.h
> +++ kvm/qemu/vl.h
> @@ -1040,16 +1040,11 @@ ParallelState *parallel_init(int base, i
>  
>  /* i8259.c */
>  
> -typedef struct PicState2 PicState2;
> -extern PicState2 *isa_pic;
> -void pic_set_irq(int irq, int level);
> -void pic_set_irq_new(void *opaque, int irq, int level);
> -PicState2 *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque);
> -void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc *alt_irq_func,
> -                          void *alt_irq_opaque);
> -int pic_read_irq(PicState2 *s);
> -void pic_update_irq(PicState2 *s);
> -uint32_t pic_intack_read(PicState2 *s);
> +#include "pic.h"
> +
> +PIC *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque);
> +void pic_set_alt_irq_func(PIC *pic, SetIRQFunc *alt_irq_func,
> +			  void *alt_irq_opaque);
>  void pic_info(void);
>  void irq_info(void);
>  
> @@ -1057,7 +1052,6 @@ void irq_info(void);
>  typedef struct IOAPICState IOAPICState;
>  
>  int apic_init(CPUState *env);
> -int apic_get_interrupt(CPUState *env);
>  IOAPICState *ioapic_init(void);
>  void ioapic_set_irq(void *opaque, int vector, int level);
>  
> Index: kvm/qemu/hw/i8259.c
> ===================================================================
> --- kvm.orig/qemu/hw/i8259.c
> +++ kvm/qemu/hw/i8259.c
> @@ -29,6 +29,8 @@
>  //#define DEBUG_IRQ_LATENCY
>  //#define DEBUG_IRQ_COUNT
>  
> +typedef struct PicState2 PicState2;
> +
>  typedef struct PicState {
>      uint8_t last_irr; /* edge detection */
>      uint8_t irr; /* interrupt request register */
> @@ -58,6 +60,7 @@ struct PicState2 {
>      /* IOAPIC callback support */
>      SetIRQFunc *alt_irq_func;
>      void *alt_irq_opaque;
> +    PIC *base;
>  };
>  
>  #if defined(DEBUG_PIC) || defined (DEBUG_IRQ_COUNT)
> @@ -133,8 +136,9 @@ static int pic_get_irq(PicState *s)
>  /* raise irq to CPU if necessary. must be called every time the active
>     irq may change */
>  /* XXX: should not export it, but it is needed for an APIC kludge */
> -void pic_update_irq(PicState2 *s)
> +static void i8259_update_irq(PIC *pic)
>  {
> +    PicState2 *s = (PicState2*)pic->private;
>      int irq2, irq;
>  
>      /* first look at slave pic */
> @@ -174,9 +178,9 @@ void pic_update_irq(PicState2 *s)
>  int64_t irq_time[16];
>  #endif
>  
> -void pic_set_irq_new(void *opaque, int irq, int level)
> +static void i8259_set_irq(PIC *pic, int irq, int level)
>  {
> -    PicState2 *s = opaque;
> +    PicState2 *s = (PicState2*)pic->private;
>  
>  #if defined(DEBUG_PIC) || defined(DEBUG_IRQ_COUNT)
>      if (level != irq_level[irq]) {
> @@ -199,13 +203,7 @@ void pic_set_irq_new(void *opaque, int i
>      /* used for IOAPIC irqs */
>      if (s->alt_irq_func)
>          s->alt_irq_func(s->alt_irq_opaque, irq, level);
> -    pic_update_irq(s);
> -}
> -
> -/* obsolete function */
> -void pic_set_irq(int irq, int level)
> -{
> -    pic_set_irq_new(isa_pic, irq, level);
> +    pic_update_irq(pic);
>  }
>  
>  /* acknowledge interrupt 'irq' */
> @@ -231,8 +229,9 @@ static inline void pic_intack(PicState *
>      }
>  }
>  
> -int pic_read_irq(PicState2 *s)
> +static int i8259_read_irq(PIC *pic)
>  {
> +    PicState2 *s = (PicState2*)pic->private;
>      int irq, irq2, intno;
>  
>      irq = pic_get_irq(&s->pics[0]);
> @@ -256,7 +255,7 @@ int pic_read_irq(PicState2 *s)
>          irq = 7;
>          intno = s->pics[0].irq_base + irq;
>      }
> -    pic_update_irq(s);
> +    pic_update_irq(pic);
>          
>  #ifdef DEBUG_IRQ_LATENCY
>      printf("IRQ%d latency=%0.3fus\n", 
> @@ -333,23 +332,23 @@ static void pic_ioport_write(void *opaqu
>                      s->isr &= ~(1 << irq);
>                      if (cmd == 5)
>                          s->priority_add = (irq + 1) & 7;
> -                    pic_update_irq(s->pics_state);
> +                    pic_update_irq(s->pics_state->base);
>                  }
>                  break;
>              case 3:
>                  irq = val & 7;
>                  s->isr &= ~(1 << irq);
> -                pic_update_irq(s->pics_state);
> +                pic_update_irq(s->pics_state->base);
>                  break;
>              case 6:
>                  s->priority_add = (val + 1) & 7;
> -                pic_update_irq(s->pics_state);
> +                pic_update_irq(s->pics_state->base);
>                  break;
>              case 7:
>                  irq = val & 7;
>                  s->isr &= ~(1 << irq);
>                  s->priority_add = (irq + 1) & 7;
> -                pic_update_irq(s->pics_state);
> +                pic_update_irq(s->pics_state->base);
>                  break;
>              default:
>                  /* no operation */
> @@ -361,7 +360,7 @@ static void pic_ioport_write(void *opaqu
>          case 0:
>              /* normal mode */
>              s->imr = val;
> -            pic_update_irq(s->pics_state);
> +            pic_update_irq(s->pics_state->base);
>              break;
>          case 1:
>              s->irq_base = val & 0xf8;
> @@ -396,10 +395,10 @@ static uint32_t pic_poll_read (PicState 
>          s->irr &= ~(1 << ret);
>          s->isr &= ~(1 << ret);
>          if (addr1 >> 7 || ret != 2)
> -            pic_update_irq(s->pics_state);
> +            pic_update_irq(s->pics_state->base);
>      } else {
>          ret = 0x07;
> -        pic_update_irq(s->pics_state);
> +        pic_update_irq(s->pics_state->base);
>      }
>  
>      return ret;
> @@ -434,8 +433,9 @@ static uint32_t pic_ioport_read(void *op
>  
>  /* memory mapped interrupt status */
>  /* XXX: may be the same than pic_read_irq() */
> -uint32_t pic_intack_read(PicState2 *s)
> +static uint32_t i8259_intack_read(PIC *pic)
>  {
> +    PicState2 *s = (PicState2*)pic->private;
>      int ret;
>  
>      ret = pic_poll_read(&s->pics[0], 0x00);
> @@ -518,16 +518,14 @@ static void pic_init1(int io_addr, int e
>      qemu_register_reset(pic_reset, s);
>  }
>  
> -void pic_info(void)
> +static void i8259_info(PIC *pic)
>  {
>      int i;
> +    PicState2 *s2 = (PicState2*)pic->private;
>      PicState *s;
>      
> -    if (!isa_pic)
> -        return;
> -
>      for(i=0;i<2;i++) {
> -        s = &isa_pic->pics[i];
> +        s = &s2->pics[i];
>          term_printf("pic%d: irr=%02x imr=%02x isr=%02x hprio=%d irq_base=%02x rr_sel=%d elcr=%02x fnm=%d\n",
>                      i, s->irr, s->imr, s->isr, s->priority_add, 
>                      s->irq_base, s->read_reg_select, s->elcr, 
> @@ -535,7 +533,7 @@ void pic_info(void)
>      }
>  }
>  
> -void irq_info(void)
> +static void i8259_stat(PIC *pic)
>  {
>  #ifndef DEBUG_IRQ_COUNT
>      term_printf("irq statistic code not compiled.\n");
> @@ -552,12 +550,38 @@ void irq_info(void)
>  #endif
>  }
>  
> -PicState2 *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque)
> +void pic_info(void)
>  {
> -    PicState2 *s;
> -    s = qemu_mallocz(sizeof(PicState2));
> -    if (!s)
> +    isa_pic->info(isa_pic);
> +}
> +
> +void irq_info(void)
> +{
> +    isa_pic->stat(isa_pic);
> +}
> +
> +PIC *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque)
> +{
> +    PIC *pic = qemu_mallocz(sizeof(PIC));
> +    if (!pic)
> +	return NULL;
> +
> +    pic->set         = i8259_set_irq;
> +    pic->read        = i8259_read_irq;
> +    pic->intack_read = i8259_intack_read;
> +    pic->update      = i8259_update_irq;
> +    pic->info        = i8259_info;
> +    pic->stat        = i8259_stat;
> +
> +    pic->private = qemu_mallocz(sizeof(PicState2));
> +    if (!pic->private) {
> +	qemu_free(pic);
>          return NULL;
> +    }
> +
> +    PicState2 *s = (PicState2*)pic->private;
> +
> +    s->base = pic;
>      pic_init1(0x20, 0x4d0, &s->pics[0]);
>      pic_init1(0xa0, 0x4d1, &s->pics[1]);
>      s->pics[0].elcr_mask = 0xf8;
> @@ -566,12 +590,14 @@ PicState2 *pic_init(IRQRequestFunc *irq_
>      s->irq_request_opaque = irq_request_opaque;
>      s->pics[0].pics_state = s;
>      s->pics[1].pics_state = s;
> -    return s;
> +    return pic;
>  }
>  
> -void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc *alt_irq_func,
> +void pic_set_alt_irq_func(PIC *pic, SetIRQFunc *alt_irq_func,
>                            void *alt_irq_opaque)
>  {
> +    PicState2 *s = (PicState2*)pic->private;
> +
>      s->alt_irq_func = alt_irq_func;
>      s->alt_irq_opaque = alt_irq_opaque;
>  }
> Index: kvm/qemu/hw/ide.c
> ===================================================================
> --- kvm.orig/qemu/hw/ide.c
> +++ kvm/qemu/hw/ide.c
> @@ -2190,7 +2190,7 @@ void isa_ide_init(int iobase, int iobase
>      if (!ide_state)
>          return;
>      
> -    ide_init2(ide_state, hd0, hd1, pic_set_irq_new, isa_pic, irq);
> +    ide_init2(ide_state, hd0, hd1, (SetIRQFunc *)isa_pic->set, isa_pic, irq);
>      ide_init_ioport(ide_state, iobase, iobase2);
>  }
>  
> @@ -2617,9 +2617,9 @@ void pci_piix_ide_init(PCIBus *bus, Bloc
>                             PCI_ADDRESS_SPACE_IO, bmdma_map);
>  
>      ide_init2(&d->ide_if[0], hd_table[0], hd_table[1],
> -              pic_set_irq_new, isa_pic, 14);
> +              (SetIRQFunc *)isa_pic->set, isa_pic, 14);
>      ide_init2(&d->ide_if[2], hd_table[2], hd_table[3],
> -              pic_set_irq_new, isa_pic, 15);
> +              (SetIRQFunc *)isa_pic->set, isa_pic, 15);
>      ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
>      ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
>  
> @@ -2656,9 +2656,9 @@ void pci_piix3_ide_init(PCIBus *bus, Blo
>                             PCI_ADDRESS_SPACE_IO, bmdma_map);
>  
>      ide_init2(&d->ide_if[0], hd_table[0], hd_table[1],
> -              pic_set_irq_new, isa_pic, 14);
> +             (SetIRQFunc *)isa_pic->set, isa_pic, 14);
>      ide_init2(&d->ide_if[2], hd_table[2], hd_table[3],
> -              pic_set_irq_new, isa_pic, 15);
> +              (SetIRQFunc *)isa_pic->set, isa_pic, 15);
>      ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
>      ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
>  
> Index: kvm/qemu/hw/pc.c
> ===================================================================
> --- kvm.orig/qemu/hw/pc.c
> +++ kvm/qemu/hw/pc.c
> @@ -89,18 +89,7 @@ void cpu_smm_update(CPUState *env)
>  /* IRQ handling */
>  int cpu_get_pic_interrupt(CPUState *env)
>  {
> -    int intno;
> -
> -    intno = apic_get_interrupt(env);
> -    if (intno >= 0) {
> -        /* set irq request if a PIC irq is still pending */
> -        /* XXX: improve that */
> -        pic_update_irq(isa_pic); 
> -        return intno;
> -    }
> -    /* read the irq from the PIC */
> -    intno = pic_read_irq(isa_pic);
> -    return intno;
> +    return apic_read_irq(env);
>  }
>  
>  static void pic_irq_request(void *opaque, int level)
> @@ -679,7 +668,7 @@ static void pc_init1(int ram_size, int v
>  
>      for(i = 0; i < MAX_SERIAL_PORTS; i++) {
>          if (serial_hds[i]) {
> -            serial_init(&pic_set_irq_new, isa_pic,
> +            serial_init((SetIRQFunc *)isa_pic->set, isa_pic,
>                          serial_io[i], serial_irq[i], serial_hds[i]);
>          }
>      }
> Index: kvm/qemu/vl.c
> ===================================================================
> --- kvm.orig/qemu/vl.c
> +++ kvm/qemu/vl.c
> @@ -186,7 +186,7 @@ int time_drift_fix = 1;
>  /* x86 ISA bus support */
>  
>  target_phys_addr_t isa_mem_base = 0;
> -PicState2 *isa_pic;
> +PIC *isa_pic;
>  
>  uint32_t default_ioport_readb(void *opaque, uint32_t address)
>  {
> Index: kvm/qemu/hw/apic.c
> ===================================================================
> --- kvm.orig/qemu/hw/apic.c
> +++ kvm/qemu/hw/apic.c
> @@ -84,6 +84,7 @@ typedef struct APICState {
>      uint32_t initial_count;
>      int64_t initial_count_load_time, next_time;
>      QEMUTimer *timer;
> +    CPUState *env;
>  } APICState;
>  
>  struct IOAPICState {
> @@ -235,9 +236,9 @@ static void apic_bus_deliver(const uint3
>                   apic_set_irq(apic_iter, vector_num, trigger_mode) );
>  }
>  
> -void cpu_set_apic_base(CPUState *env, uint64_t val)
> +static void _apic_set_base(APIC *apic, uint64_t val)
>  {
> -    APICState *s = env->apic_state;
> +    APICState *s = (APICState*)apic->private;
>  #ifdef DEBUG_APIC
>      printf("cpu_set_apic_base: %016" PRIx64 "\n", val);
>  #endif
> @@ -246,30 +247,30 @@ void cpu_set_apic_base(CPUState *env, ui
>      /* if disabled, cannot be enabled again */
>      if (!(val & MSR_IA32_APICBASE_ENABLE)) {
>          s->apicbase &= ~MSR_IA32_APICBASE_ENABLE;
> -        env->cpuid_features &= ~CPUID_APIC;
> +        s->env->cpuid_features &= ~CPUID_APIC;
>          s->spurious_vec &= ~APIC_SV_ENABLE;
>      }
>  }
>  
> -uint64_t cpu_get_apic_base(CPUState *env)
> +static uint64_t _apic_get_base(APIC *apic)
>  {
> -    APICState *s = env->apic_state;
> +    APICState *s = (APICState*)apic->private;
>  #ifdef DEBUG_APIC
>      printf("cpu_get_apic_base: %016" PRIx64 "\n", (uint64_t)s->apicbase);
>  #endif
>      return s->apicbase;
>  }
>  
> -void cpu_set_apic_tpr(CPUX86State *env, uint8_t val)
> +static void _apic_set_tpr(APIC *apic, uint8_t val)
>  {
> -    APICState *s = env->apic_state;
> +    APICState *s = (APICState*)apic->private;
>      s->tpr = (val & 0x0f) << 4;
>      apic_update_irq(s);
>  }
>  
> -uint8_t cpu_get_apic_tpr(CPUX86State *env)
> +static uint8_t _apic_get_tpr(APIC *apic)
>  {
> -    APICState *s = env->apic_state;
> +    APICState *s = (APICState*)apic->private;
>      return s->tpr >> 4;
>  }
>  
> @@ -460,28 +461,34 @@ static void apic_deliver(APICState *s, u
>                       trigger_mode);
>  }
>  
> -int apic_get_interrupt(CPUState *env)
> +static int _apic_read_irq(APIC *apic)
>  {
> -    APICState *s = env->apic_state;
> +    APICState *s = (APICState*)apic->private;
>      int intno;
>  
>      /* if the APIC is installed or enabled, we let the 8259 handle the
>         IRQs */
> -    if (!s)
> -        return -1;
> -    if (!(s->spurious_vec & APIC_SV_ENABLE))
> -        return -1;
> +    if (!s || !(s->spurious_vec & APIC_SV_ENABLE))
> +	goto use_pic;
>      
>      /* XXX: spurious IRQ handling */
>      intno = get_highest_priority_int(s->irr);
>      if (intno < 0)
> -        return -1;
> +	goto use_pic;
>      if (s->tpr && intno <= s->tpr)
>          return s->spurious_vec & 0xff;
>      reset_bit(s->irr, intno);
>      set_bit(s->isr, intno);
>      apic_update_irq(s);
> +    
> +    /* set irq request if a PIC irq is still pending */
> +    /* XXX: improve that */
> +    pic_update_irq(isa_pic); 
> +
>      return intno;
> +
> + use_pic:
> +    return pic_read_irq(isa_pic);
>  }
>  
>  static uint32_t apic_get_current_count(APICState *s)
> @@ -563,7 +570,7 @@ static uint32_t apic_mem_readl(void *opa
>      env = cpu_single_env;
>      if (!env)
>          return 0;
> -    s = env->apic_state;
> +    s = (APICState*)((APIC*)(env->apic))->private;
>  
>      index = (addr >> 4) & 0xff;
>      switch(index) {
> @@ -640,7 +647,7 @@ static void apic_mem_writel(void *opaque
>      env = cpu_single_env;
>      if (!env)
>          return;
> -    s = env->apic_state;
> +    s = (APICState*)((APIC*)(env->apic))->private;
>  
>  #ifdef DEBUG_APIC
>      printf("APIC write: %08x = %08x\n", (uint32_t)addr, val);
> @@ -806,14 +813,31 @@ static CPUWriteMemoryFunc *apic_mem_writ
>  
>  int apic_init(CPUState *env)
>  {
> +    APIC      *apic;
>      APICState *s;
>  
>      if (last_apic_id >= MAX_APICS)
>          return -1;
> + 
> +    apic = qemu_mallocz(sizeof(APIC));
> +    if(!apic)
> +	return -1;
> +
>      s = qemu_mallocz(sizeof(APICState));
> -    if (!s)
> +    if (!s) {
> +	qemu_free(apic);
>          return -1;
> -    env->apic_state = s;
> +    }
> +    env->apic = apic;
> +    s->env = env;
> +
> +    apic->read_irq = _apic_read_irq;
> +    apic->set_base = _apic_set_base;
> +    apic->get_base = _apic_get_base;
> +    apic->set_tpr  = _apic_set_tpr;
> +    apic->get_tpr  = _apic_get_tpr;
> +
> +    apic->private = s;
>      apic_init_ipi(s);
>      s->id = last_apic_id++;
>      s->cpu_env = env;
> Index: kvm/qemu/target-i386/cpu.h
> ===================================================================
> --- kvm.orig/qemu/target-i386/cpu.h
> +++ kvm/qemu/target-i386/cpu.h
> @@ -553,7 +553,7 @@ typedef struct CPUX86State {
>  
>      /* in order to simplify APIC support, we leave this pointer to the
>         user */
> -    struct APICState *apic_state;
> +    void *apic;
>  } CPUX86State;
>  
>  CPUX86State *cpu_x86_init(void);
> @@ -651,12 +651,6 @@ void cpu_x86_set_a20(CPUX86State *env, i
>  
>  uint64_t cpu_get_tsc(CPUX86State *env);
>  
> -void cpu_set_apic_base(CPUX86State *env, uint64_t val);
> -uint64_t cpu_get_apic_base(CPUX86State *env);
> -void cpu_set_apic_tpr(CPUX86State *env, uint8_t val);
> -#ifndef NO_CPU_IO_DEFS
> -uint8_t cpu_get_apic_tpr(CPUX86State *env);
> -#endif
>  void cpu_smm_update(CPUX86State *env);
>  
>  /* will be suppressed */
> Index: kvm/qemu/qemu-kvm.c
> ===================================================================
> --- kvm.orig/qemu/qemu-kvm.c
> +++ kvm/qemu/qemu-kvm.c
> @@ -7,6 +7,7 @@
>  #include "exec.h"
>  
>  #include "qemu-kvm.h"
> +#include "pic.h"
>  #include <kvmctl.h>
>  #include <string.h>
>  
> @@ -215,9 +216,9 @@ static void load_regs(CPUState *env)
>      sregs.cr3 = env->cr[3];
>      sregs.cr4 = env->cr[4];
>  
> -    sregs.apic_base = cpu_get_apic_base(env);
> +    sregs.apic_base = apic_get_base(env);
>      sregs.efer = env->efer;
> -    sregs.cr8 = cpu_get_apic_tpr(env);
> +    sregs.cr8 = apic_get_tpr(env);
>  
>      kvm_set_sregs(kvm_context, 0, &sregs);
>  
> @@ -298,7 +299,7 @@ static void save_regs(CPUState *env)
>      env->cr[3] = sregs.cr3;
>      env->cr[4] = sregs.cr4;
>  
> -    cpu_set_apic_base(env, sregs.apic_base);
> +    apic_set_base(env, sregs.apic_base);
>  
>      env->efer = sregs.efer;
>      //cpu_set_apic_tpr(env, sregs.cr8);
> @@ -403,7 +404,7 @@ static void post_kvm_run(void *opaque, s
>      env->eflags = (kvm_run->if_flag) ? env->eflags | IF_MASK:env->eflags & ~IF_MASK;
>      env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection;
>      //cpu_set_apic_tpr(env, kvm_run->cr8);
> -    cpu_set_apic_base(env, kvm_run->apic_base);
> +    apic_set_base(env, kvm_run->apic_base);
>  }
>  
>  static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
> @@ -411,7 +412,7 @@ static void pre_kvm_run(void *opaque, st
>      CPUState **envs = opaque, *env;
>      env = envs[0];
>  
> -    kvm_run->cr8 = cpu_get_apic_tpr(env);
> +    kvm_run->cr8 = apic_get_tpr(env);
>  }
>  
>  void kvm_load_registers(CPUState *env)
> Index: kvm/qemu/target-i386/helper.c
> ===================================================================
> --- kvm.orig/qemu/target-i386/helper.c
> +++ kvm/qemu/target-i386/helper.c
> @@ -18,6 +18,7 @@
>   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
>   */
>  #include "exec.h"
> +#include "pic.h"
>  #ifdef USE_KVM
>  extern int kvm_allowed;
>  #endif
> @@ -2650,7 +2651,7 @@ void helper_movl_crN_T0(int reg)
>          cpu_x86_update_cr4(env, T0);
>          break;
>      case 8:
> -        cpu_set_apic_tpr(env, T0);
> +        apic_set_tpr(env, T0);
>          break;
>      default:
>          env->cr[reg] = T0;
> @@ -2708,7 +2709,7 @@ void helper_wrmsr(void)
>          env->sysenter_eip = val;
>          break;
>      case MSR_IA32_APICBASE:
> -        cpu_set_apic_base(env, val);
> +        apic_set_base(env, val);
>          break;
>      case MSR_EFER:
>          {
> @@ -2772,7 +2773,7 @@ void helper_rdmsr(void)
>          val = env->sysenter_eip;
>          break;
>      case MSR_IA32_APICBASE:
> -        val = cpu_get_apic_base(env);
> +        val = apic_get_base(env);
>          break;
>      case MSR_EFER:
>          val = env->efer;
> Index: kvm/qemu/target-i386/op.c
> ===================================================================
> --- kvm.orig/qemu/target-i386/op.c
> +++ kvm/qemu/target-i386/op.c
> @@ -20,6 +20,7 @@
>  
>  #define ASM_SOFTMMU
>  #include "exec.h"
> +#include "pic.h"
>  
>  /* n must be a constant to be efficient */
>  static inline target_long lshift(target_long x, int n)
> @@ -1246,7 +1247,7 @@ void OPPROTO op_movl_crN_T0(void)
>  #if !defined(CONFIG_USER_ONLY) 
>  void OPPROTO op_movtl_T0_cr8(void)
>  {
> -    T0 = cpu_get_apic_tpr(env);
> +    T0 = apic_get_tpr(env);
>  }
>  #endif
>  
> Index: kvm/qemu/pic.h
> ===================================================================
> --- /dev/null
> +++ kvm/qemu/pic.h
> @@ -0,0 +1,52 @@
> +/************************************************************************
> + * pic.h: Provides an indirection layer for the (A)PIC which allows 
> + *        dynamic substituion of the interrupt handling code.  This
> + *        will allow us to selectively chose the KVM in-kernel apic 
> + *        emulation vs the QEMU user-space apic emulation.  Support
> + *        for both is key to allow "--no-kvm" type operation to continue
> + *        working even after the in-kernel code is deployed.
> + *
> + *        Written by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
> + *
> + ***********************************************************************/
> +
> +#ifndef QEMU_PIC_H
> +#define QEMU_PIC_H
> +
> +typedef struct PIC_tag
> +{
> +    void     (*set)(struct PIC_tag *this, int irq, int level);
> +    int      (*read)(struct PIC_tag *this);
> +    uint32_t (*intack_read)(struct PIC_tag *this);
> +    void     (*update)(struct PIC_tag *this);
> +    void     (*info)(struct PIC_tag *this);
> +    void     (*stat)(struct PIC_tag *this);
> +    
> +    void *private;
> +}PIC;
> +
> +extern PIC *isa_pic;
> +#define pic_set_irq(irq, level)          isa_pic->set(isa_pic, irq, level)
> +#define pic_set_irq_new(pic, irq, level) pic->set(pic, irq, level)
> +#define pic_read_irq(pic)                pic->read(pic)
> +#define pic_intack_read(pic)             pic->intack_read(pic)
> +#define pic_update_irq(pic)              pic->update(pic)
> +
> +typedef struct APIC_tag
> +{
> +    int      (*read_irq)(void *this);
> +    void     (*set_base)(void *this, uint64_t val);
> +    uint64_t (*get_base)(void *this);
> +    void     (*set_tpr)(void *this, uint8_t val);
> +    uint8_t  (*get_tpr)(void *this);
> +
> +    void *private;
> +}APIC;
> +
> +#define apic_read_irq(env)      ((APIC*)(env->apic))->read_irq(env->apic)
> +#define apic_set_base(env, val) ((APIC*)(env->apic))->set_base(env->apic, val)
> +#define apic_get_base(env)      ((APIC*)(env->apic))->get_base(env->apic)
> +#define apic_set_tpr(env, val)  ((APIC*)(env->apic))->set_tpr(env->apic, val)
> +#define apic_get_tpr(env)       ((APIC*)(env->apic))->get_tpr(env->apic)
> +
> +#endif /* QEMU_PIC_H */
>   
> ------------------------------------------------------------------------
>
> -------------------------------------------------------------------------
> Take Surveys. Earn Cash. Influence the Future of IT
> Join SourceForge.net's Techsay panel and you'll get the chance to share your
> opinions on IT & business topics through brief surveys-and earn cash
> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
> ------------------------------------------------------------------------
>
> _______________________________________________
> kvm-devel mailing list
> kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>   


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]     ` <46134B74.1080004-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 14:20       ` Anthony Liguori
       [not found]         ` <4613B438.60107-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 14:20 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Gregory Haskins wrote:
>> Hi Dor,
>>   Please find a patch attached for your review which adds support for 
>> dynamic substitution of the PIC/APIC code to QEMU. This will allow us 
>> to selectively chose the KVM in-kernel apic emulation vs the QEMU 
>> user-space apic emulation.  Support for both is key to allow 
>> "--no-kvm" type operation to continue working even after the 
>> in-kernel code is deployed.
>>
>> Note that this is only the part the allows indirection.  The code 
>> that actually fills in the "slim apic" in QEMU, as well as the kernel 
>> side changes applied to git are not included here.  Note also that 
>> this patch can stand alone.  I have confirmed that I can boot a guest 
>> with no discernible difference in behavior/performance both before 
>> and after this patch. YMMV as my test cases are limited.
>>
>> Note that this patch only touches the KVM specific portions of QEMU 
>> (namely, x86/i8259/apic support).  If we decide that this should be 
>> pushed to QEMU upstream, there is still work to be done to convert 
>> the other PIC implementations (e.g. arm_pic, etc) to use the new 
>> interface.
>>   
>
> While the approach is technically better than #ifdefing things away, 
> this patch would really cause us to drift too far from the qemu 
> codebase, making the eventual merge back (and any intervening merges 
> to kvm) much more difficult.  We also have no guarantee that upstream 
> qemu will accept this change.
>
> If we could get this accepted into upstream qemu, and then merge 
> qemu-devel, that would resolve these issues.

The devices are already written to take a set_irq function.  Instead of 
hijacking the emulated PIC device, I think it would be better if in 
pc.c, we just conditionally created our PIC device that reflected to the 
hypervisor and passed the appropriate function to the emulated hardware.

Otherwise, to support all the other architectures, there's going to be a 
lot of modifications.

Then again, are we really positive that we have to move the APIC into 
the kernel?  A lot of things will get much more complicated.

Regards,

Anthony Liguori

> Anthony, you're our qemu expert.  What's your opinion?
>
>
>
>> Thanks!
>>
>> -Greg
>>   
>> ------------------------------------------------------------------------
>>
>> Index: kvm/qemu/vl.h
>> ===================================================================
>> --- kvm.orig/qemu/vl.h
>> +++ kvm/qemu/vl.h
>> @@ -1040,16 +1040,11 @@ ParallelState *parallel_init(int base, i
>>  
>>  /* i8259.c */
>>  
>> -typedef struct PicState2 PicState2;
>> -extern PicState2 *isa_pic;
>> -void pic_set_irq(int irq, int level);
>> -void pic_set_irq_new(void *opaque, int irq, int level);
>> -PicState2 *pic_init(IRQRequestFunc *irq_request, void 
>> *irq_request_opaque);
>> -void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc *alt_irq_func,
>> -                          void *alt_irq_opaque);
>> -int pic_read_irq(PicState2 *s);
>> -void pic_update_irq(PicState2 *s);
>> -uint32_t pic_intack_read(PicState2 *s);
>> +#include "pic.h"
>> +
>> +PIC *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque);
>> +void pic_set_alt_irq_func(PIC *pic, SetIRQFunc *alt_irq_func,
>> +              void *alt_irq_opaque);
>>  void pic_info(void);
>>  void irq_info(void);
>>  
>> @@ -1057,7 +1052,6 @@ void irq_info(void);
>>  typedef struct IOAPICState IOAPICState;
>>  
>>  int apic_init(CPUState *env);
>> -int apic_get_interrupt(CPUState *env);
>>  IOAPICState *ioapic_init(void);
>>  void ioapic_set_irq(void *opaque, int vector, int level);
>>  
>> Index: kvm/qemu/hw/i8259.c
>> ===================================================================
>> --- kvm.orig/qemu/hw/i8259.c
>> +++ kvm/qemu/hw/i8259.c
>> @@ -29,6 +29,8 @@
>>  //#define DEBUG_IRQ_LATENCY
>>  //#define DEBUG_IRQ_COUNT
>>  
>> +typedef struct PicState2 PicState2;
>> +
>>  typedef struct PicState {
>>      uint8_t last_irr; /* edge detection */
>>      uint8_t irr; /* interrupt request register */
>> @@ -58,6 +60,7 @@ struct PicState2 {
>>      /* IOAPIC callback support */
>>      SetIRQFunc *alt_irq_func;
>>      void *alt_irq_opaque;
>> +    PIC *base;
>>  };
>>  
>>  #if defined(DEBUG_PIC) || defined (DEBUG_IRQ_COUNT)
>> @@ -133,8 +136,9 @@ static int pic_get_irq(PicState *s)
>>  /* raise irq to CPU if necessary. must be called every time the active
>>     irq may change */
>>  /* XXX: should not export it, but it is needed for an APIC kludge */
>> -void pic_update_irq(PicState2 *s)
>> +static void i8259_update_irq(PIC *pic)
>>  {
>> +    PicState2 *s = (PicState2*)pic->private;
>>      int irq2, irq;
>>  
>>      /* first look at slave pic */
>> @@ -174,9 +178,9 @@ void pic_update_irq(PicState2 *s)
>>  int64_t irq_time[16];
>>  #endif
>>  
>> -void pic_set_irq_new(void *opaque, int irq, int level)
>> +static void i8259_set_irq(PIC *pic, int irq, int level)
>>  {
>> -    PicState2 *s = opaque;
>> +    PicState2 *s = (PicState2*)pic->private;
>>  
>>  #if defined(DEBUG_PIC) || defined(DEBUG_IRQ_COUNT)
>>      if (level != irq_level[irq]) {
>> @@ -199,13 +203,7 @@ void pic_set_irq_new(void *opaque, int i
>>      /* used for IOAPIC irqs */
>>      if (s->alt_irq_func)
>>          s->alt_irq_func(s->alt_irq_opaque, irq, level);
>> -    pic_update_irq(s);
>> -}
>> -
>> -/* obsolete function */
>> -void pic_set_irq(int irq, int level)
>> -{
>> -    pic_set_irq_new(isa_pic, irq, level);
>> +    pic_update_irq(pic);
>>  }
>>  
>>  /* acknowledge interrupt 'irq' */
>> @@ -231,8 +229,9 @@ static inline void pic_intack(PicState *
>>      }
>>  }
>>  
>> -int pic_read_irq(PicState2 *s)
>> +static int i8259_read_irq(PIC *pic)
>>  {
>> +    PicState2 *s = (PicState2*)pic->private;
>>      int irq, irq2, intno;
>>  
>>      irq = pic_get_irq(&s->pics[0]);
>> @@ -256,7 +255,7 @@ int pic_read_irq(PicState2 *s)
>>          irq = 7;
>>          intno = s->pics[0].irq_base + irq;
>>      }
>> -    pic_update_irq(s);
>> +    pic_update_irq(pic);
>>           #ifdef DEBUG_IRQ_LATENCY
>>      printf("IRQ%d latency=%0.3fus\n", @@ -333,23 +332,23 @@ static 
>> void pic_ioport_write(void *opaqu
>>                      s->isr &= ~(1 << irq);
>>                      if (cmd == 5)
>>                          s->priority_add = (irq + 1) & 7;
>> -                    pic_update_irq(s->pics_state);
>> +                    pic_update_irq(s->pics_state->base);
>>                  }
>>                  break;
>>              case 3:
>>                  irq = val & 7;
>>                  s->isr &= ~(1 << irq);
>> -                pic_update_irq(s->pics_state);
>> +                pic_update_irq(s->pics_state->base);
>>                  break;
>>              case 6:
>>                  s->priority_add = (val + 1) & 7;
>> -                pic_update_irq(s->pics_state);
>> +                pic_update_irq(s->pics_state->base);
>>                  break;
>>              case 7:
>>                  irq = val & 7;
>>                  s->isr &= ~(1 << irq);
>>                  s->priority_add = (irq + 1) & 7;
>> -                pic_update_irq(s->pics_state);
>> +                pic_update_irq(s->pics_state->base);
>>                  break;
>>              default:
>>                  /* no operation */
>> @@ -361,7 +360,7 @@ static void pic_ioport_write(void *opaqu
>>          case 0:
>>              /* normal mode */
>>              s->imr = val;
>> -            pic_update_irq(s->pics_state);
>> +            pic_update_irq(s->pics_state->base);
>>              break;
>>          case 1:
>>              s->irq_base = val & 0xf8;
>> @@ -396,10 +395,10 @@ static uint32_t pic_poll_read (PicState 
>>          s->irr &= ~(1 << ret);
>>          s->isr &= ~(1 << ret);
>>          if (addr1 >> 7 || ret != 2)
>> -            pic_update_irq(s->pics_state);
>> +            pic_update_irq(s->pics_state->base);
>>      } else {
>>          ret = 0x07;
>> -        pic_update_irq(s->pics_state);
>> +        pic_update_irq(s->pics_state->base);
>>      }
>>  
>>      return ret;
>> @@ -434,8 +433,9 @@ static uint32_t pic_ioport_read(void *op
>>  
>>  /* memory mapped interrupt status */
>>  /* XXX: may be the same than pic_read_irq() */
>> -uint32_t pic_intack_read(PicState2 *s)
>> +static uint32_t i8259_intack_read(PIC *pic)
>>  {
>> +    PicState2 *s = (PicState2*)pic->private;
>>      int ret;
>>  
>>      ret = pic_poll_read(&s->pics[0], 0x00);
>> @@ -518,16 +518,14 @@ static void pic_init1(int io_addr, int e
>>      qemu_register_reset(pic_reset, s);
>>  }
>>  
>> -void pic_info(void)
>> +static void i8259_info(PIC *pic)
>>  {
>>      int i;
>> +    PicState2 *s2 = (PicState2*)pic->private;
>>      PicState *s;
>>      -    if (!isa_pic)
>> -        return;
>> -
>>      for(i=0;i<2;i++) {
>> -        s = &isa_pic->pics[i];
>> +        s = &s2->pics[i];
>>          term_printf("pic%d: irr=%02x imr=%02x isr=%02x hprio=%d 
>> irq_base=%02x rr_sel=%d elcr=%02x fnm=%d\n",
>>                      i, s->irr, s->imr, s->isr, s->priority_add, 
>>                      s->irq_base, s->read_reg_select, s->elcr, @@ 
>> -535,7 +533,7 @@ void pic_info(void)
>>      }
>>  }
>>  
>> -void irq_info(void)
>> +static void i8259_stat(PIC *pic)
>>  {
>>  #ifndef DEBUG_IRQ_COUNT
>>      term_printf("irq statistic code not compiled.\n");
>> @@ -552,12 +550,38 @@ void irq_info(void)
>>  #endif
>>  }
>>  
>> -PicState2 *pic_init(IRQRequestFunc *irq_request, void 
>> *irq_request_opaque)
>> +void pic_info(void)
>>  {
>> -    PicState2 *s;
>> -    s = qemu_mallocz(sizeof(PicState2));
>> -    if (!s)
>> +    isa_pic->info(isa_pic);
>> +}
>> +
>> +void irq_info(void)
>> +{
>> +    isa_pic->stat(isa_pic);
>> +}
>> +
>> +PIC *pic_init(IRQRequestFunc *irq_request, void *irq_request_opaque)
>> +{
>> +    PIC *pic = qemu_mallocz(sizeof(PIC));
>> +    if (!pic)
>> +    return NULL;
>> +
>> +    pic->set         = i8259_set_irq;
>> +    pic->read        = i8259_read_irq;
>> +    pic->intack_read = i8259_intack_read;
>> +    pic->update      = i8259_update_irq;
>> +    pic->info        = i8259_info;
>> +    pic->stat        = i8259_stat;
>> +
>> +    pic->private = qemu_mallocz(sizeof(PicState2));
>> +    if (!pic->private) {
>> +    qemu_free(pic);
>>          return NULL;
>> +    }
>> +
>> +    PicState2 *s = (PicState2*)pic->private;
>> +
>> +    s->base = pic;
>>      pic_init1(0x20, 0x4d0, &s->pics[0]);
>>      pic_init1(0xa0, 0x4d1, &s->pics[1]);
>>      s->pics[0].elcr_mask = 0xf8;
>> @@ -566,12 +590,14 @@ PicState2 *pic_init(IRQRequestFunc *irq_
>>      s->irq_request_opaque = irq_request_opaque;
>>      s->pics[0].pics_state = s;
>>      s->pics[1].pics_state = s;
>> -    return s;
>> +    return pic;
>>  }
>>  
>> -void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc *alt_irq_func,
>> +void pic_set_alt_irq_func(PIC *pic, SetIRQFunc *alt_irq_func,
>>                            void *alt_irq_opaque)
>>  {
>> +    PicState2 *s = (PicState2*)pic->private;
>> +
>>      s->alt_irq_func = alt_irq_func;
>>      s->alt_irq_opaque = alt_irq_opaque;
>>  }
>> Index: kvm/qemu/hw/ide.c
>> ===================================================================
>> --- kvm.orig/qemu/hw/ide.c
>> +++ kvm/qemu/hw/ide.c
>> @@ -2190,7 +2190,7 @@ void isa_ide_init(int iobase, int iobase
>>      if (!ide_state)
>>          return;
>>      -    ide_init2(ide_state, hd0, hd1, pic_set_irq_new, isa_pic, irq);
>> +    ide_init2(ide_state, hd0, hd1, (SetIRQFunc *)isa_pic->set, 
>> isa_pic, irq);
>>      ide_init_ioport(ide_state, iobase, iobase2);
>>  }
>>  
>> @@ -2617,9 +2617,9 @@ void pci_piix_ide_init(PCIBus *bus, Bloc
>>                             PCI_ADDRESS_SPACE_IO, bmdma_map);
>>  
>>      ide_init2(&d->ide_if[0], hd_table[0], hd_table[1],
>> -              pic_set_irq_new, isa_pic, 14);
>> +              (SetIRQFunc *)isa_pic->set, isa_pic, 14);
>>      ide_init2(&d->ide_if[2], hd_table[2], hd_table[3],
>> -              pic_set_irq_new, isa_pic, 15);
>> +              (SetIRQFunc *)isa_pic->set, isa_pic, 15);
>>      ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
>>      ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
>>  
>> @@ -2656,9 +2656,9 @@ void pci_piix3_ide_init(PCIBus *bus, Blo
>>                             PCI_ADDRESS_SPACE_IO, bmdma_map);
>>  
>>      ide_init2(&d->ide_if[0], hd_table[0], hd_table[1],
>> -              pic_set_irq_new, isa_pic, 14);
>> +             (SetIRQFunc *)isa_pic->set, isa_pic, 14);
>>      ide_init2(&d->ide_if[2], hd_table[2], hd_table[3],
>> -              pic_set_irq_new, isa_pic, 15);
>> +              (SetIRQFunc *)isa_pic->set, isa_pic, 15);
>>      ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
>>      ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
>>  
>> Index: kvm/qemu/hw/pc.c
>> ===================================================================
>> --- kvm.orig/qemu/hw/pc.c
>> +++ kvm/qemu/hw/pc.c
>> @@ -89,18 +89,7 @@ void cpu_smm_update(CPUState *env)
>>  /* IRQ handling */
>>  int cpu_get_pic_interrupt(CPUState *env)
>>  {
>> -    int intno;
>> -
>> -    intno = apic_get_interrupt(env);
>> -    if (intno >= 0) {
>> -        /* set irq request if a PIC irq is still pending */
>> -        /* XXX: improve that */
>> -        pic_update_irq(isa_pic); -        return intno;
>> -    }
>> -    /* read the irq from the PIC */
>> -    intno = pic_read_irq(isa_pic);
>> -    return intno;
>> +    return apic_read_irq(env);
>>  }
>>  
>>  static void pic_irq_request(void *opaque, int level)
>> @@ -679,7 +668,7 @@ static void pc_init1(int ram_size, int v
>>  
>>      for(i = 0; i < MAX_SERIAL_PORTS; i++) {
>>          if (serial_hds[i]) {
>> -            serial_init(&pic_set_irq_new, isa_pic,
>> +            serial_init((SetIRQFunc *)isa_pic->set, isa_pic,
>>                          serial_io[i], serial_irq[i], serial_hds[i]);
>>          }
>>      }
>> Index: kvm/qemu/vl.c
>> ===================================================================
>> --- kvm.orig/qemu/vl.c
>> +++ kvm/qemu/vl.c
>> @@ -186,7 +186,7 @@ int time_drift_fix = 1;
>>  /* x86 ISA bus support */
>>  
>>  target_phys_addr_t isa_mem_base = 0;
>> -PicState2 *isa_pic;
>> +PIC *isa_pic;
>>  
>>  uint32_t default_ioport_readb(void *opaque, uint32_t address)
>>  {
>> Index: kvm/qemu/hw/apic.c
>> ===================================================================
>> --- kvm.orig/qemu/hw/apic.c
>> +++ kvm/qemu/hw/apic.c
>> @@ -84,6 +84,7 @@ typedef struct APICState {
>>      uint32_t initial_count;
>>      int64_t initial_count_load_time, next_time;
>>      QEMUTimer *timer;
>> +    CPUState *env;
>>  } APICState;
>>  
>>  struct IOAPICState {
>> @@ -235,9 +236,9 @@ static void apic_bus_deliver(const uint3
>>                   apic_set_irq(apic_iter, vector_num, trigger_mode) );
>>  }
>>  
>> -void cpu_set_apic_base(CPUState *env, uint64_t val)
>> +static void _apic_set_base(APIC *apic, uint64_t val)
>>  {
>> -    APICState *s = env->apic_state;
>> +    APICState *s = (APICState*)apic->private;
>>  #ifdef DEBUG_APIC
>>      printf("cpu_set_apic_base: %016" PRIx64 "\n", val);
>>  #endif
>> @@ -246,30 +247,30 @@ void cpu_set_apic_base(CPUState *env, ui
>>      /* if disabled, cannot be enabled again */
>>      if (!(val & MSR_IA32_APICBASE_ENABLE)) {
>>          s->apicbase &= ~MSR_IA32_APICBASE_ENABLE;
>> -        env->cpuid_features &= ~CPUID_APIC;
>> +        s->env->cpuid_features &= ~CPUID_APIC;
>>          s->spurious_vec &= ~APIC_SV_ENABLE;
>>      }
>>  }
>>  
>> -uint64_t cpu_get_apic_base(CPUState *env)
>> +static uint64_t _apic_get_base(APIC *apic)
>>  {
>> -    APICState *s = env->apic_state;
>> +    APICState *s = (APICState*)apic->private;
>>  #ifdef DEBUG_APIC
>>      printf("cpu_get_apic_base: %016" PRIx64 "\n", 
>> (uint64_t)s->apicbase);
>>  #endif
>>      return s->apicbase;
>>  }
>>  
>> -void cpu_set_apic_tpr(CPUX86State *env, uint8_t val)
>> +static void _apic_set_tpr(APIC *apic, uint8_t val)
>>  {
>> -    APICState *s = env->apic_state;
>> +    APICState *s = (APICState*)apic->private;
>>      s->tpr = (val & 0x0f) << 4;
>>      apic_update_irq(s);
>>  }
>>  
>> -uint8_t cpu_get_apic_tpr(CPUX86State *env)
>> +static uint8_t _apic_get_tpr(APIC *apic)
>>  {
>> -    APICState *s = env->apic_state;
>> +    APICState *s = (APICState*)apic->private;
>>      return s->tpr >> 4;
>>  }
>>  
>> @@ -460,28 +461,34 @@ static void apic_deliver(APICState *s, u
>>                       trigger_mode);
>>  }
>>  
>> -int apic_get_interrupt(CPUState *env)
>> +static int _apic_read_irq(APIC *apic)
>>  {
>> -    APICState *s = env->apic_state;
>> +    APICState *s = (APICState*)apic->private;
>>      int intno;
>>  
>>      /* if the APIC is installed or enabled, we let the 8259 handle the
>>         IRQs */
>> -    if (!s)
>> -        return -1;
>> -    if (!(s->spurious_vec & APIC_SV_ENABLE))
>> -        return -1;
>> +    if (!s || !(s->spurious_vec & APIC_SV_ENABLE))
>> +    goto use_pic;
>>           /* XXX: spurious IRQ handling */
>>      intno = get_highest_priority_int(s->irr);
>>      if (intno < 0)
>> -        return -1;
>> +    goto use_pic;
>>      if (s->tpr && intno <= s->tpr)
>>          return s->spurious_vec & 0xff;
>>      reset_bit(s->irr, intno);
>>      set_bit(s->isr, intno);
>>      apic_update_irq(s);
>> +    +    /* set irq request if a PIC irq is still pending */
>> +    /* XXX: improve that */
>> +    pic_update_irq(isa_pic); +
>>      return intno;
>> +
>> + use_pic:
>> +    return pic_read_irq(isa_pic);
>>  }
>>  
>>  static uint32_t apic_get_current_count(APICState *s)
>> @@ -563,7 +570,7 @@ static uint32_t apic_mem_readl(void *opa
>>      env = cpu_single_env;
>>      if (!env)
>>          return 0;
>> -    s = env->apic_state;
>> +    s = (APICState*)((APIC*)(env->apic))->private;
>>  
>>      index = (addr >> 4) & 0xff;
>>      switch(index) {
>> @@ -640,7 +647,7 @@ static void apic_mem_writel(void *opaque
>>      env = cpu_single_env;
>>      if (!env)
>>          return;
>> -    s = env->apic_state;
>> +    s = (APICState*)((APIC*)(env->apic))->private;
>>  
>>  #ifdef DEBUG_APIC
>>      printf("APIC write: %08x = %08x\n", (uint32_t)addr, val);
>> @@ -806,14 +813,31 @@ static CPUWriteMemoryFunc *apic_mem_writ
>>  
>>  int apic_init(CPUState *env)
>>  {
>> +    APIC      *apic;
>>      APICState *s;
>>  
>>      if (last_apic_id >= MAX_APICS)
>>          return -1;
>> + +    apic = qemu_mallocz(sizeof(APIC));
>> +    if(!apic)
>> +    return -1;
>> +
>>      s = qemu_mallocz(sizeof(APICState));
>> -    if (!s)
>> +    if (!s) {
>> +    qemu_free(apic);
>>          return -1;
>> -    env->apic_state = s;
>> +    }
>> +    env->apic = apic;
>> +    s->env = env;
>> +
>> +    apic->read_irq = _apic_read_irq;
>> +    apic->set_base = _apic_set_base;
>> +    apic->get_base = _apic_get_base;
>> +    apic->set_tpr  = _apic_set_tpr;
>> +    apic->get_tpr  = _apic_get_tpr;
>> +
>> +    apic->private = s;
>>      apic_init_ipi(s);
>>      s->id = last_apic_id++;
>>      s->cpu_env = env;
>> Index: kvm/qemu/target-i386/cpu.h
>> ===================================================================
>> --- kvm.orig/qemu/target-i386/cpu.h
>> +++ kvm/qemu/target-i386/cpu.h
>> @@ -553,7 +553,7 @@ typedef struct CPUX86State {
>>  
>>      /* in order to simplify APIC support, we leave this pointer to the
>>         user */
>> -    struct APICState *apic_state;
>> +    void *apic;
>>  } CPUX86State;
>>  
>>  CPUX86State *cpu_x86_init(void);
>> @@ -651,12 +651,6 @@ void cpu_x86_set_a20(CPUX86State *env, i
>>  
>>  uint64_t cpu_get_tsc(CPUX86State *env);
>>  
>> -void cpu_set_apic_base(CPUX86State *env, uint64_t val);
>> -uint64_t cpu_get_apic_base(CPUX86State *env);
>> -void cpu_set_apic_tpr(CPUX86State *env, uint8_t val);
>> -#ifndef NO_CPU_IO_DEFS
>> -uint8_t cpu_get_apic_tpr(CPUX86State *env);
>> -#endif
>>  void cpu_smm_update(CPUX86State *env);
>>  
>>  /* will be suppressed */
>> Index: kvm/qemu/qemu-kvm.c
>> ===================================================================
>> --- kvm.orig/qemu/qemu-kvm.c
>> +++ kvm/qemu/qemu-kvm.c
>> @@ -7,6 +7,7 @@
>>  #include "exec.h"
>>  
>>  #include "qemu-kvm.h"
>> +#include "pic.h"
>>  #include <kvmctl.h>
>>  #include <string.h>
>>  
>> @@ -215,9 +216,9 @@ static void load_regs(CPUState *env)
>>      sregs.cr3 = env->cr[3];
>>      sregs.cr4 = env->cr[4];
>>  
>> -    sregs.apic_base = cpu_get_apic_base(env);
>> +    sregs.apic_base = apic_get_base(env);
>>      sregs.efer = env->efer;
>> -    sregs.cr8 = cpu_get_apic_tpr(env);
>> +    sregs.cr8 = apic_get_tpr(env);
>>  
>>      kvm_set_sregs(kvm_context, 0, &sregs);
>>  
>> @@ -298,7 +299,7 @@ static void save_regs(CPUState *env)
>>      env->cr[3] = sregs.cr3;
>>      env->cr[4] = sregs.cr4;
>>  
>> -    cpu_set_apic_base(env, sregs.apic_base);
>> +    apic_set_base(env, sregs.apic_base);
>>  
>>      env->efer = sregs.efer;
>>      //cpu_set_apic_tpr(env, sregs.cr8);
>> @@ -403,7 +404,7 @@ static void post_kvm_run(void *opaque, s
>>      env->eflags = (kvm_run->if_flag) ? env->eflags | 
>> IF_MASK:env->eflags & ~IF_MASK;
>>      env->ready_for_interrupt_injection = 
>> kvm_run->ready_for_interrupt_injection;
>>      //cpu_set_apic_tpr(env, kvm_run->cr8);
>> -    cpu_set_apic_base(env, kvm_run->apic_base);
>> +    apic_set_base(env, kvm_run->apic_base);
>>  }
>>  
>>  static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
>> @@ -411,7 +412,7 @@ static void pre_kvm_run(void *opaque, st
>>      CPUState **envs = opaque, *env;
>>      env = envs[0];
>>  
>> -    kvm_run->cr8 = cpu_get_apic_tpr(env);
>> +    kvm_run->cr8 = apic_get_tpr(env);
>>  }
>>  
>>  void kvm_load_registers(CPUState *env)
>> Index: kvm/qemu/target-i386/helper.c
>> ===================================================================
>> --- kvm.orig/qemu/target-i386/helper.c
>> +++ kvm/qemu/target-i386/helper.c
>> @@ -18,6 +18,7 @@
>>   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  
>> 02111-1307  USA
>>   */
>>  #include "exec.h"
>> +#include "pic.h"
>>  #ifdef USE_KVM
>>  extern int kvm_allowed;
>>  #endif
>> @@ -2650,7 +2651,7 @@ void helper_movl_crN_T0(int reg)
>>          cpu_x86_update_cr4(env, T0);
>>          break;
>>      case 8:
>> -        cpu_set_apic_tpr(env, T0);
>> +        apic_set_tpr(env, T0);
>>          break;
>>      default:
>>          env->cr[reg] = T0;
>> @@ -2708,7 +2709,7 @@ void helper_wrmsr(void)
>>          env->sysenter_eip = val;
>>          break;
>>      case MSR_IA32_APICBASE:
>> -        cpu_set_apic_base(env, val);
>> +        apic_set_base(env, val);
>>          break;
>>      case MSR_EFER:
>>          {
>> @@ -2772,7 +2773,7 @@ void helper_rdmsr(void)
>>          val = env->sysenter_eip;
>>          break;
>>      case MSR_IA32_APICBASE:
>> -        val = cpu_get_apic_base(env);
>> +        val = apic_get_base(env);
>>          break;
>>      case MSR_EFER:
>>          val = env->efer;
>> Index: kvm/qemu/target-i386/op.c
>> ===================================================================
>> --- kvm.orig/qemu/target-i386/op.c
>> +++ kvm/qemu/target-i386/op.c
>> @@ -20,6 +20,7 @@
>>  
>>  #define ASM_SOFTMMU
>>  #include "exec.h"
>> +#include "pic.h"
>>  
>>  /* n must be a constant to be efficient */
>>  static inline target_long lshift(target_long x, int n)
>> @@ -1246,7 +1247,7 @@ void OPPROTO op_movl_crN_T0(void)
>>  #if !defined(CONFIG_USER_ONLY)  void OPPROTO op_movtl_T0_cr8(void)
>>  {
>> -    T0 = cpu_get_apic_tpr(env);
>> +    T0 = apic_get_tpr(env);
>>  }
>>  #endif
>>  
>> Index: kvm/qemu/pic.h
>> ===================================================================
>> --- /dev/null
>> +++ kvm/qemu/pic.h
>> @@ -0,0 +1,52 @@
>> +/************************************************************************ 
>>
>> + * pic.h: Provides an indirection layer for the (A)PIC which allows 
>> + *        dynamic substituion of the interrupt handling code.  This
>> + *        will allow us to selectively chose the KVM in-kernel apic 
>> + *        emulation vs the QEMU user-space apic emulation.  Support
>> + *        for both is key to allow "--no-kvm" type operation to 
>> continue
>> + *        working even after the in-kernel code is deployed.
>> + *
>> + *        Written by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
>> + *
>> + 
>> ***********************************************************************/
>> +
>> +#ifndef QEMU_PIC_H
>> +#define QEMU_PIC_H
>> +
>> +typedef struct PIC_tag
>> +{
>> +    void     (*set)(struct PIC_tag *this, int irq, int level);
>> +    int      (*read)(struct PIC_tag *this);
>> +    uint32_t (*intack_read)(struct PIC_tag *this);
>> +    void     (*update)(struct PIC_tag *this);
>> +    void     (*info)(struct PIC_tag *this);
>> +    void     (*stat)(struct PIC_tag *this);
>> +    +    void *private;
>> +}PIC;
>> +
>> +extern PIC *isa_pic;
>> +#define pic_set_irq(irq, level)          isa_pic->set(isa_pic, irq, 
>> level)
>> +#define pic_set_irq_new(pic, irq, level) pic->set(pic, irq, level)
>> +#define pic_read_irq(pic)                pic->read(pic)
>> +#define pic_intack_read(pic)             pic->intack_read(pic)
>> +#define pic_update_irq(pic)              pic->update(pic)
>> +
>> +typedef struct APIC_tag
>> +{
>> +    int      (*read_irq)(void *this);
>> +    void     (*set_base)(void *this, uint64_t val);
>> +    uint64_t (*get_base)(void *this);
>> +    void     (*set_tpr)(void *this, uint8_t val);
>> +    uint8_t  (*get_tpr)(void *this);
>> +
>> +    void *private;
>> +}APIC;
>> +
>> +#define apic_read_irq(env)      
>> ((APIC*)(env->apic))->read_irq(env->apic)
>> +#define apic_set_base(env, val) 
>> ((APIC*)(env->apic))->set_base(env->apic, val)
>> +#define apic_get_base(env)      
>> ((APIC*)(env->apic))->get_base(env->apic)
>> +#define apic_set_tpr(env, val)  
>> ((APIC*)(env->apic))->set_tpr(env->apic, val)
>> +#define apic_get_tpr(env)       
>> ((APIC*)(env->apic))->get_tpr(env->apic)
>> +
>> +#endif /* QEMU_PIC_H */
>>   
>> ------------------------------------------------------------------------
>>
>> ------------------------------------------------------------------------- 
>>
>> Take Surveys. Earn Cash. Influence the Future of IT
>> Join SourceForge.net's Techsay panel and you'll get the chance to 
>> share your
>> opinions on IT & business topics through brief surveys-and earn cash
>> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV 
>>
>> ------------------------------------------------------------------------
>>
>> _______________________________________________
>> kvm-devel mailing list
>> kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
>> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>>   
>
>


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]         ` <4613B438.60107-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 14:39           ` Avi Kivity
       [not found]             ` <4613B89F.8090806-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-04 17:51           ` Gregory Haskins
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 14:39 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>
> Then again, are we really positive that we have to move the APIC into 
> the kernel?  A lot of things will get much more complicated.

The following arguments are in favor:
- allow in-kernel paravirt drivers to interrupt the guest without going 
through qemu (which involves a signal and some complexity)
- same for guest SMP IPI
- reduced overhead for a much-loved hardware component (especially on 
Windows, where one regularly sees 100K apic updates a second)

The strength of these arguments increase as vmexit overhead decreases 
with improving hardware.

Of course, pushing such a piece of misery into the kernel where it can 
cause much pain should not be done lightly.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]             ` <4613B89F.8090806-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 14:55               ` Anthony Liguori
       [not found]                 ` <4613BC6B.1070708-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 14:55 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>>
>> Then again, are we really positive that we have to move the APIC into 
>> the kernel?  A lot of things will get much more complicated.
>
> The following arguments are in favor:
> - allow in-kernel paravirt drivers to interrupt the guest without 
> going through qemu (which involves a signal and some complexity)
> - same for guest SMP IPI
> - reduced overhead for a much-loved hardware component (especially on 
> Windows, where one regularly sees 100K apic updates a second)

This is for the TPR right?  VT has special logic to handle TPR 
virtualization doesn't it?  I thought SVM did too...

> The strength of these arguments increase as vmexit overhead decreases 
> with improving hardware.

Do you mean, the strength of these arguments decrease? :-)

>
> Of course, pushing such a piece of misery into the kernel where it can 
> cause much pain should not be done lightly.

Right, so some arguments against:
- The cost to go to userspace is small compared to the cost of the exit 
itself
- Maintaining hardware emulation in two places is going to suck
- The need to have in-kernel backends for paravirt drivers should be 
carefully considered.

An in-kernel backend is certainly not necessary for disks.  If we can't 
drive network traffic fast enough from userspace, perhaps we should 
consider improving the userspace interfaces.

Originating disk IO from userspace is useful for support interesting 
storage formats.  Network IO from userspace is also interesting to 
support things like slirp.

Regards,

Anthony Liguori


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                 ` <4613BC6B.1070708-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 15:06                   ` Avi Kivity
       [not found]                     ` <4613BF07.50606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 15:06 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
> Avi Kivity wrote:
>> Anthony Liguori wrote:
>>>
>>> Then again, are we really positive that we have to move the APIC 
>>> into the kernel?  A lot of things will get much more complicated.
>>
>> The following arguments are in favor:
>> - allow in-kernel paravirt drivers to interrupt the guest without 
>> going through qemu (which involves a signal and some complexity)
>> - same for guest SMP IPI
>> - reduced overhead for a much-loved hardware component (especially on 
>> Windows, where one regularly sees 100K apic updates a second)
>
> This is for the TPR right?  VT has special logic to handle TPR 
> virtualization doesn't it?  I thought SVM did too...
>

Yes, the TPR.  Both VT and SVM virtualize CR8 in 64-bit mode.  SVM also 
supports CR8 in 32-bit mode through a nwe instruction encoding, but 
nobody uses that to my knowledge.  Maybe some brave soul can hack kvm to 
patch the new instruction in place of the mmio instruction Windows uses 
to bang on the tpr.


>> The strength of these arguments increase as vmexit overhead decreases 
>> with improving hardware.
>
> Do you mean, the strength of these arguments decrease? :-)

Nope, increase.  Currently vmexit time dominates the privilege switch 
time to user mode.  If vmexit time is reduced to be on par with syscall 
time, you get 50% saving (assuming tpr emulation is near-free, which it 
can be easily made to be).

>
>>
>> Of course, pushing such a piece of misery into the kernel where it 
>> can cause much pain should not be done lightly.
>
> Right, so some arguments against:
> - The cost to go to userspace is small compared to the cost of the 
> exit itself

True now.  False tomorrow (for some value of tomorrow).

> - Maintaining hardware emulation in two places is going to suck

True now.  Even truer tomorrow.

> - The need to have in-kernel backends for paravirt drivers should be 
> carefully considered.
>
> An in-kernel backend is certainly not necessary for disks.  

Agree.

> If we can't drive network traffic fast enough from userspace, perhaps 
> we should consider improving the userspace interfaces.

I'm open to suggestions.  With an in-kernel backend, we can have 
copyless transmit and single-copy receive.  With the current APIs, we 
can have single-copy transmit and 3-copy receive.  I doubt we could 
improve it beyond (1, 2) without major hacking on non-kvm kernel interfaces.

>
> Originating disk IO from userspace is useful for support interesting 
> storage formats.  Network IO from userspace is also interesting to 
> support things like slirp. 

Right.  Nobody's suggesting removing it, however.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                     ` <4613BF07.50606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 15:36                       ` Nakajima, Jun
       [not found]                         ` <8FFF7E42E93CC646B632AB40643802A8025B9580-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2007-04-04 15:51                       ` Anthony Liguori
  1 sibling, 1 reply; 81+ messages in thread
From: Nakajima, Jun @ 2007-04-04 15:36 UTC (permalink / raw)
  To: Avi Kivity, Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> Anthony Liguori wrote:
>>>> 
>>>> Then again, are we really positive that we have to move the APIC
>>>> into the kernel?  A lot of things will get much more complicated.
>>> 
>>> The following arguments are in favor:
>>> - allow in-kernel paravirt drivers to interrupt the guest without
>>> going through qemu (which involves a signal and some complexity)
>>> - same for guest SMP IPI
>>> - reduced overhead for a much-loved hardware component (especially
>>> on Windows, where one regularly sees 100K apic updates a second)
>> 
>> This is for the TPR right?  VT has special logic to handle TPR
>> virtualization doesn't it?  I thought SVM did too...
>> 
> 
> Yes, the TPR.  Both VT and SVM virtualize CR8 in 64-bit mode.  SVM
> also supports CR8 in 32-bit mode through a nwe instruction encoding,
> but 
> nobody uses that to my knowledge.  Maybe some brave soul can hack kvm
> to patch the new instruction in place of the mmio instruction Windows
> uses 
> to bang on the tpr.

Actually VT has virtual TPR support that does not require CR8. We
submitted a patch for Xen. Please see
http://lists.xensource.com/archives/html/xen-devel/2007-03/msg00993.html
The spec should be available soon. We are working on a patch for KVM.

Jun
---
Intel Open Source Technology Center

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                         ` <8FFF7E42E93CC646B632AB40643802A8025B9580-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-04 15:47                           ` Dor Laor
       [not found]                             ` <64F9B87B6B770947A9F8391472E032160B318E96-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Dor Laor @ 2007-04-04 15:47 UTC (permalink / raw)
  To: Nakajima, Jun, Avi Kivity, Anthony Liguori
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>>> This is for the TPR right?  VT has special logic to handle TPR
>>> virtualization doesn't it?  I thought SVM did too...
>>>
>>
>> Yes, the TPR.  Both VT and SVM virtualize CR8 in 64-bit mode.  SVM
>> also supports CR8 in 32-bit mode through a nwe instruction encoding,
>> but
>> nobody uses that to my knowledge.  Maybe some brave soul can hack kvm
>> to patch the new instruction in place of the mmio instruction Windows
>> uses
>> to bang on the tpr.
>
>Actually VT has virtual TPR support that does not require CR8. We
>submitted a patch for Xen. Please see
>http://lists.xensource.com/archives/html/xen-devel/2007-03/msg00993.htm
l
>The spec should be available soon. We are working on a patch for KVM.
>
>Jun

That's superb! Windows gives us really bad time with all of those TPR
accesses. Because of that we currently prefer of using the non-acpi HAL.

On what version on chip is this feature supported?

This pushes towards in kernel apic too. Can't see how we avoid it.

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                     ` <4613BF07.50606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-04 15:36                       ` Nakajima, Jun
@ 2007-04-04 15:51                       ` Anthony Liguori
       [not found]                         ` <4613C993.9020405-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 15:51 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> Anthony Liguori wrote:
>>>>
>>>> Then again, are we really positive that we have to move the APIC 
>>>> into the kernel?  A lot of things will get much more complicated.
>>>
>>> The following arguments are in favor:
>>> - allow in-kernel paravirt drivers to interrupt the guest without 
>>> going through qemu (which involves a signal and some complexity)
>>> - same for guest SMP IPI
>>> - reduced overhead for a much-loved hardware component (especially 
>>> on Windows, where one regularly sees 100K apic updates a second)
>>
>> This is for the TPR right?  VT has special logic to handle TPR 
>> virtualization doesn't it?  I thought SVM did too...
>>
>
> Yes, the TPR.  Both VT and SVM virtualize CR8 in 64-bit mode.  SVM 
> also supports CR8 in 32-bit mode through a nwe instruction encoding, 
> but nobody uses that to my knowledge.  Maybe some brave soul can hack 
> kvm to patch the new instruction in place of the mmio instruction 
> Windows uses to bang on the tpr.

It seems like that shouldn't be too hard assuming that the MMIO 
instructions are <= the new CR8 instruction.  It would require knowing 
where the TPR is mapped into memory of course.

If we do this, then we can probably just handle the TPR as a special 
case anyway and not bother returning to userspace when the TPR is 
updated through MMIO.  That saves the round trip without adding 
emulation complexity.

>> If we can't drive network traffic fast enough from userspace, perhaps 
>> we should consider improving the userspace interfaces.
>
> I'm open to suggestions.  With an in-kernel backend, we can have 
> copyless transmit and single-copy receive.  With the current APIs, we 
> can have single-copy transmit and 3-copy receive.  I doubt we could 
> improve it beyond (1, 2) without major hacking on non-kvm kernel 
> interfaces.

I don't know enough about networking to speak intelligently here so I'll 
just defer to the experts :-)

Regards,

Anthony Liguori

>>
>> Originating disk IO from userspace is useful for support interesting 
>> storage formats.  Network IO from userspace is also interesting to 
>> support things like slirp. 
>
> Right.  Nobody's suggesting removing it, however.
>


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                             ` <64F9B87B6B770947A9F8391472E032160B318E96-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
@ 2007-04-04 15:53                               ` Anthony Liguori
       [not found]                                 ` <4613C9EE.5030600-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 15:53 UTC (permalink / raw)
  To: Dor Laor; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Avi Kivity

Dor Laor wrote:
>>>> This is for the TPR right?  VT has special logic to handle TPR
>>>> virtualization doesn't it?  I thought SVM did too...
>>>>
>>>>         
>>> Yes, the TPR.  Both VT and SVM virtualize CR8 in 64-bit mode.  SVM
>>> also supports CR8 in 32-bit mode through a nwe instruction encoding,
>>> but
>>> nobody uses that to my knowledge.  Maybe some brave soul can hack kvm
>>> to patch the new instruction in place of the mmio instruction Windows
>>> uses
>>> to bang on the tpr.
>>>       
>> Actually VT has virtual TPR support that does not require CR8. We
>> submitted a patch for Xen. Please see
>> http://lists.xensource.com/archives/html/xen-devel/2007-03/msg00993.htm
>>     
> l
>   
>> The spec should be available soon. We are working on a patch for KVM.
>>
>> Jun
>>     
>
> That's superb! Windows gives us really bad time with all of those TPR
> accesses. Because of that we currently prefer of using the non-acpi HAL.
>
> On what version on chip is this feature supported?
>
> This pushes towards in kernel apic too. Can't see how we avoid it.
>   

Does it really?  IIUC, we would avoid TPR traps entirely and would just 
need to synchronize the TPR whenever we go down to userspace.

Regards,

Anthony Liguori



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                         ` <4613C993.9020405-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 16:02                           ` Avi Kivity
       [not found]                             ` <4613CC01.1090500-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 16:02 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>> Maybe some brave soul can hack kvm to patch the new instruction in 
>> place of the mmio instruction Windows uses to bang on the tpr.
>
> It seems like that shouldn't be too hard assuming that the MMIO 
> instructions are <= the new CR8 instruction.  It would require knowing 
> where the TPR is mapped into memory of course.

Well, we know the physical address (some msr) and the virtual mapping.  
But we must be sure that the instruction is only used for setting the 
tpr, and not other registers.

Er, thinking a bit more, cr8 is just 4 bits (and no, not the least 
significant) out of the 8-bit tpr, so it doesn't work without serious 
hackery.

>
> If we do this, then we can probably just handle the TPR as a special 
> case anyway and not bother returning to userspace when the TPR is 
> updated through MMIO.  That saves the round trip without adding 
> emulation complexity.

That means the emulation is split among user space and kernel.  Not 
nice.  One of the advantages of moving the entire thing is that it is at 
least clearly defined.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                 ` <4613C9EE.5030600-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 16:05                                   ` Avi Kivity
       [not found]                                     ` <4613CCD1.2070702-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 16:05 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>
>>
>> This pushes towards in kernel apic too. Can't see how we avoid it.
>>   
>
> Does it really?  IIUC, we would avoid TPR traps entirely and would 
> just need to synchronize the TPR whenever we go down to userspace.
>

It's a bit more complex than that, as userspace would need to tell the 
kernel the highest priority pending interrupt so that it can program the 
hardware to exit when an interrupt is ready.  However I agree with you 
that in principle we could split the apic emulation between kvm and 
qemu, even with this featurette.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                             ` <4613CC01.1090500-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 16:09                               ` Anthony Liguori
       [not found]                                 ` <4613CDB2.4000903-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 16:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>>> Maybe some brave soul can hack kvm to patch the new instruction in 
>>> place of the mmio instruction Windows uses to bang on the tpr.
>>
>> It seems like that shouldn't be too hard assuming that the MMIO 
>> instructions are <= the new CR8 instruction.  It would require 
>> knowing where the TPR is mapped into memory of course.
>
> Well, we know the physical address (some msr) and the virtual 
> mapping.  But we must be sure that the instruction is only used for 
> setting the tpr, and not other registers.
>
> Er, thinking a bit more, cr8 is just 4 bits (and no, not the least 
> significant) out of the 8-bit tpr, so it doesn't work without serious 
> hackery.

Hrm, so this not nearly as straight forward as I initially hoped :-/

>>
>> If we do this, then we can probably just handle the TPR as a special 
>> case anyway and not bother returning to userspace when the TPR is 
>> updated through MMIO.  That saves the round trip without adding 
>> emulation complexity.
>
> That means the emulation is split among user space and kernel.  Not 
> nice.  One of the advantages of moving the entire thing is that it is 
> at least clearly defined.

It still exists in userspace.  Having the code duplication (especially 
when it's not the same code base) is unfortunate.  Plus, it complicates 
save/restore/migration since now some device state is in the kernel.  It 
further complicates things if you want to make sure that KVM saved 
images are loadable in QEMU (you have to make sure that the device state 
is identical for the kernel and userspace).  Special casing the TPR 
emulation seems like the lesser of two evils to me.

Regards,

Anthony Liguori


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                 ` <4613CDB2.4000903-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 16:19                                   ` Avi Kivity
       [not found]                                     ` <4613D001.3040606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-04 16:23                                   ` Dor Laor
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 16:19 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>>>
>>> If we do this, then we can probably just handle the TPR as a special 
>>> case anyway and not bother returning to userspace when the TPR is 
>>> updated through MMIO.  That saves the round trip without adding 
>>> emulation complexity.
>>
>> That means the emulation is split among user space and kernel.  Not 
>> nice.  One of the advantages of moving the entire thing is that it is 
>> at least clearly defined.
>
> It still exists in userspace.  Having the code duplication (especially 
> when it's not the same code base) is unfortunate.

This remains true.

> Plus, it complicates save/restore/migration since now some device 
> state is in the kernel.  It further complicates things if you want to 
> make sure that KVM saved images are loadable in QEMU (you have to make 
> sure that the device state is identical for the kernel and userspace).  

You'd just load the kernel state into qemu state, like we do with the 
registers, and use the regular qemu save.  You could turn kernel apic 
emulation on and off during runtime :)

> Special casing the TPR emulation seems like the lesser of two evils to 
> me.

It's not just the tpr.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                     ` <4613CCD1.2070702-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 16:20                                       ` Nakajima, Jun
       [not found]                                         ` <8FFF7E42E93CC646B632AB40643802A8025B962E-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Nakajima, Jun @ 2007-04-04 16:20 UTC (permalink / raw)
  To: Avi Kivity, Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>> 
>>> 
>>> This pushes towards in kernel apic too. Can't see how we avoid it.
>>> 
>> 
>> Does it really?  IIUC, we would avoid TPR traps entirely and would
>> just need to synchronize the TPR whenever we go down to userspace.
>> 
> 
> It's a bit more complex than that, as userspace would need to tell the
> kernel the highest priority pending interrupt so that it can program
> the hardware to exit when an interrupt is ready.  However I agree
> with you that in principle we could split the apic emulation between
> kvm and qemu, even with this featurette.

Most of H/W-virtualization capable processors out there don't support
that feature today. I think the decision (kvm or qemu) should be done
based on performance data. I'm not worried about maintenance issues; the
APIC code is not expected to change frequently. I'm a bit worried about
extra complexity caused by such split, though. 

BTW, I see CPU utilization of qemu is almost always 99% in the top
command when I run kernel build in an x86-64 Linux guest.

Jun
---
Intel Open Source Technology Center

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                 ` <4613CDB2.4000903-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  2007-04-04 16:19                                   ` Avi Kivity
@ 2007-04-04 16:23                                   ` Dor Laor
       [not found]                                     ` <64F9B87B6B770947A9F8391472E032160B318ED4-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Dor Laor @ 2007-04-04 16:23 UTC (permalink / raw)
  To: Anthony Liguori, Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>>>
>>> If we do this, then we can probably just handle the TPR as a special
>>> case anyway and not bother returning to userspace when the TPR is
>>> updated through MMIO.  That saves the round trip without adding
>>> emulation complexity.
>>
>> That means the emulation is split among user space and kernel.  Not
>> nice.  One of the advantages of moving the entire thing is that it is
>> at least clearly defined.
>
>It still exists in userspace.  Having the code duplication (especially
>when it's not the same code base) is unfortunate.  Plus, it complicates
>save/restore/migration since now some device state is in the kernel.
It
>further complicates things if you want to make sure that KVM saved
>images are loadable in QEMU (you have to make sure that the device
state
>is identical for the kernel and userspace).  Special casing the TPR
>emulation seems like the lesser of two evils to me.
>

There are still some nasty issues that caused by running apic in qemu:
 - If you have a PV driver that issued an irq and now it needs another
one,
   You cannot really be sure if the first one was injected. You can hold

   extra state and follow irq injection from qemu but this is ugly.
 - You have to keep sync of tpr and irq injection: suppose apic needs to

   inject an irq, it pops it from the irr and tries to inject it. If it
is 
   done while the VM is in interrupt window closed or irq disables it
will 
   no happen instantly. When the irq would really be injected the tpr
might 
   be different, causing windows irq-not-less-or-equal BSOD.
   This is especially true for injecting several irq at once.

Keeping the apic in the kernel simplifies this with the cost of
maintaining an apic/pic implementation. 

Do you know why Xen guy choose of implementing it in Xen? Why didn't
they rip Qemu implementation?


>Regards,
>
>Anthony Liguori


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <8FFF7E42E93CC646B632AB40643802A8025B962E-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-04 16:27                                           ` Dor Laor
       [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B318EDB-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
  2007-04-04 16:32                                           ` Avi Kivity
  1 sibling, 1 reply; 81+ messages in thread
From: Dor Laor @ 2007-04-04 16:27 UTC (permalink / raw)
  To: Nakajima, Jun, Avi Kivity, Anthony Liguori
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>>>>
>>>> This pushes towards in kernel apic too. Can't see how we avoid it.
>>>>
>>>
>>> Does it really?  IIUC, we would avoid TPR traps entirely and would
>>> just need to synchronize the TPR whenever we go down to userspace.
>>>
>>
>> It's a bit more complex than that, as userspace would need to tell
the
>> kernel the highest priority pending interrupt so that it can program
>> the hardware to exit when an interrupt is ready.  However I agree
>> with you that in principle we could split the apic emulation between
>> kvm and qemu, even with this featurette.
>
>Most of H/W-virtualization capable processors out there don't support
>that feature today. I think the decision (kvm or qemu) should be done
>based on performance data. I'm not worried about maintenance issues;
the
>APIC code is not expected to change frequently. I'm a bit worried about
>extra complexity caused by such split, though.
>

I had an in-kernel-apic implementation for KVM and the performance
improvement was insignificant. It is mainly a software engineering
thing.
PV drivers can benefit from APIC in the kernel but again just a mere
improvement.


>BTW, I see CPU utilization of qemu is almost always 99% in the top
>command when I run kernel build in an x86-64 Linux guest.


What does qemu do?

Idle guest hardly consume cpu, especially KVM powered.

>Jun
>---
>Intel Open Source Technology Center

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <8FFF7E42E93CC646B632AB40643802A8025B962E-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2007-04-04 16:27                                           ` Dor Laor
@ 2007-04-04 16:32                                           ` Avi Kivity
       [not found]                                             ` <4613D30E.7030905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 16:32 UTC (permalink / raw)
  To: Nakajima, Jun; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Nakajima, Jun wrote:
> Most of H/W-virtualization capable processors out there don't support
> that feature today. I think the decision (kvm or qemu) should be done
> based on performance data. I'm not worried about maintenance issues; the
> APIC code is not expected to change frequently. I'm a bit worried about
> extra complexity caused by such split, though. 
>
>   

In principle we could measure the performance cost today with the pv-net 
driver; however it still does a lot of copies which could be eliminated.

> BTW, I see CPU utilization of qemu is almost always 99% in the top
> command when I run kernel build in an x86-64 Linux guest.
>   

Isn't that expected? if your guest image is mostly cached in the host, 
the guest would have nothing to block on.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B318EDB-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
@ 2007-04-04 16:43                                               ` Anthony Liguori
       [not found]                                                 ` <4613D596.7080201-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 16:43 UTC (permalink / raw)
  To: Dor Laor; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Avi Kivity

Dor Laor wrote:
>>>>> This pushes towards in kernel apic too. Can't see how we avoid it.
>>>>>
>>>>>           
>>>> Does it really?  IIUC, we would avoid TPR traps entirely and would
>>>> just need to synchronize the TPR whenever we go down to userspace.
>>>>
>>>>         
>>> It's a bit more complex than that, as userspace would need to tell
>>>       
> the
>   
>>> kernel the highest priority pending interrupt so that it can program
>>> the hardware to exit when an interrupt is ready.  However I agree
>>> with you that in principle we could split the apic emulation between
>>> kvm and qemu, even with this featurette.
>>>       
>> Most of H/W-virtualization capable processors out there don't support
>> that feature today. I think the decision (kvm or qemu) should be done
>> based on performance data. I'm not worried about maintenance issues;
>>     
> the
>   
>> APIC code is not expected to change frequently. I'm a bit worried about
>> extra complexity caused by such split, though.
>>
>>     
>
> I had an in-kernel-apic implementation for KVM and the performance
> improvement was insignificant. It is mainly a software engineering
> thing.
> PV drivers can benefit from APIC in the kernel but again just a mere
> improvement.
>
>
>   
>> BTW, I see CPU utilization of qemu is almost always 99% in the top
>> command when I run kernel build in an x86-64 Linux guest.
>>     
>
>
> What does qemu do?
>
> Idle guest hardly consume cpu, especially KVM powered.
>   

qemu would be 99% even if all the time is being spent in the guest context.

If the user time is high, an oprofile run would be pretty useful.  I've 
found that the VGA drawing routines can be pretty expensive.

Regards,

Anthony Liguori

>> Jun
>> ---
>> Intel Open Source Technology Center
>>     
>
>   


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                     ` <64F9B87B6B770947A9F8391472E032160B318ED4-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
@ 2007-04-04 16:48                                       ` Anthony Liguori
       [not found]                                         ` <4613D6CD.6060209-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  2007-04-05 13:50                                       ` Dong, Eddie
  1 sibling, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 16:48 UTC (permalink / raw)
  To: Dor Laor; +Cc: kvm-devel, Avi Kivity

Dor Laor wrote:
>>>> If we do this, then we can probably just handle the TPR as a special
>>>> case anyway and not bother returning to userspace when the TPR is
>>>> updated through MMIO.  That saves the round trip without adding
>>>> emulation complexity.
>>>>         
>>> That means the emulation is split among user space and kernel.  Not
>>> nice.  One of the advantages of moving the entire thing is that it is
>>> at least clearly defined.
>>>       
>> It still exists in userspace.  Having the code duplication (especially
>> when it's not the same code base) is unfortunate.  Plus, it complicates
>> save/restore/migration since now some device state is in the kernel.
>>     
> It
>   
>> further complicates things if you want to make sure that KVM saved
>> images are loadable in QEMU (you have to make sure that the device
>>     
> state
>   
>> is identical for the kernel and userspace).  Special casing the TPR
>> emulation seems like the lesser of two evils to me.
>>
>>     
>
> There are still some nasty issues that caused by running apic in qemu:
>  - If you have a PV driver that issued an irq and now it needs another
> one,
>    You cannot really be sure if the first one was injected. You can hold
>
>    extra state and follow irq injection from qemu but this is ugly.
>  - You have to keep sync of tpr and irq injection: suppose apic needs to
>
>    inject an irq, it pops it from the irr and tries to inject it. If it
> is 
>    done while the VM is in interrupt window closed or irq disables it
> will 
>    no happen instantly. When the irq would really be injected the tpr
> might 
>    be different, causing windows irq-not-less-or-equal BSOD.
>    This is especially true for injecting several irq at once.
>
> Keeping the apic in the kernel simplifies this with the cost of
> maintaining an apic/pic implementation. 
>   

Hrm, this is definitely starting to sound like a PITA to deal with.  
Maybe in-kernel platform devices are unavoidable :-/

> Do you know why Xen guy choose of implementing it in Xen? Why didn't
> they rip Qemu implementation?
>   

I believe it's based on the QEMU implementation although it's evolved 
quite a bit.  Jun can probably provide a better answer.

Regards,

Anthony Liguori

>   
>> Regards,
>>
>> Anthony Liguori
>>     
>
>
>   


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <4613D30E.7030905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 16:48                                               ` Nakajima, Jun
       [not found]                                                 ` <8FFF7E42E93CC646B632AB40643802A8025B96AA-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Nakajima, Jun @ 2007-04-04 16:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Nakajima, Jun wrote:
>> Most of H/W-virtualization capable processors out there don't support
>> that feature today. I think the decision (kvm or qemu) should be done
>> based on performance data. I'm not worried about maintenance issues;
>> the APIC code is not expected to change frequently. I'm a bit
>> worried about extra complexity caused by such split, though.
>> 
>> 
> 
> In principle we could measure the performance cost today with the
> pv-net driver; however it still does a lot of copies which could be
> eliminated. 
> 
>> BTW, I see CPU utilization of qemu is almost always 99% in the top
>> command when I run kernel build in an x86-64 Linux guest.
>> 
> 
> Isn't that expected? if your guest image is mostly cached in the host,
> the guest would have nothing to block on.

I compared the performance on Xen and KVM for kernel build using the
same guest image. Looks like KVM was (kvm-17) three times slower as far
as we tested, and that high load of qemu was one of the symptoms. We are
looking at the shadow code, but the load of qemu looks very high. I
remember we had similar problems in Xen before, but those were fixed.
Someone should take a look at the qemu side.

Jun
---
Intel Open Source Technology Center

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                 ` <4613D596.7080201-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 16:54                                                   ` Avi Kivity
  0 siblings, 0 replies; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 16:54 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>>  
>>> BTW, I see CPU utilization of qemu is almost always 99% in the top
>>> command when I run kernel build in an x86-64 Linux guest.
>>>     
>>
>
> qemu would be 99% even if all the time is being spent in the guest 
> context.
>
> If the user time is high, an oprofile run would be pretty useful.  
> I've found that the VGA drawing routines can be pretty expensive.
>

Ah, Jun's used to seeing Xen's qemu-dm idling when the guest is not 
issuing I/O.  With kvm, guest cpu time is accounted as system time to 
the qemu process.  Thanks for clearing that up, I didn't understand the 
question till now.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                 ` <8FFF7E42E93CC646B632AB40643802A8025B96AA-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-04 16:58                                                   ` Avi Kivity
       [not found]                                                     ` <4613D93A.5020902-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-04 18:12                                                   ` Anthony Liguori
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-04 16:58 UTC (permalink / raw)
  To: Nakajima, Jun; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Nakajima, Jun wrote:
> I compared the performance on Xen and KVM for kernel build using the
> same guest image. Looks like KVM was (kvm-17) three times slower as far
> as we tested, and that high load of qemu was one of the symptoms. We are
> looking at the shadow code, but the load of qemu looks very high. I
> remember we had similar problems in Xen before, but those were fixed.
> Someone should take a look at the qemu side.
>   

I'd expect the following issues to dominate:

- the shadow cache is quite small at 256 pages.  Increasing it may 
increase performance.

- we haven't yet taught the scheduler that migrating vcpus is expensive 
due to the IPI needed to fetch the vmcs.  Maybe running with 'taskset 1' 
would help

- shadow eviction policy is FIFO, not LRU, which probably causes many 
page faults.

Running kvm_stat can help show what's going on.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                     ` <4613D93A.5020902-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 17:09                                                       ` Nakajima, Jun
       [not found]                                                         ` <8FFF7E42E93CC646B632AB40643802A8025B970D-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Nakajima, Jun @ 2007-04-04 17:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Nakajima, Jun wrote:
>> I compared the performance on Xen and KVM for kernel build using the
>> same guest image. Looks like KVM was (kvm-17) three times slower as
>> far as we tested, and that high load of qemu was one of the
>> symptoms. We are looking at the shadow code, but the load of qemu
>> looks very high. I remember we had similar problems in Xen before,
>> but those were fixed. Someone should take a look at the qemu side.
>> 
> 
> I'd expect the following issues to dominate:
> 
> - the shadow cache is quite small at 256 pages.  Increasing it may
> increase performance.

Yes, we are aware of this. 

> 
> - we haven't yet taught the scheduler that migrating vcpus is
> expensive due to the IPI needed to fetch the vmcs.  Maybe running
> with 'taskset 1' would help
> 
> - shadow eviction policy is FIFO, not LRU, which probably causes many
> page faults.

This may explain that the performance gets worse as we repeat kernel
build, at least, the second run is slower than the first one.  

> 
> Running kvm_stat can help show what's going on.

Thanks for the good insights. We'll come back with some analysis.

Jun
---
Intel Open Source Technology Center

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]         ` <4613B438.60107-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
  2007-04-04 14:39           ` Avi Kivity
@ 2007-04-04 17:51           ` Gregory Haskins
       [not found]             ` <46139F39.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Gregory Haskins @ 2007-04-04 17:51 UTC (permalink / raw)
  To: Anthony Liguori, Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>>> On Wed, Apr 4, 2007 at 10:20 AM, in message <4613B438.60107-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>,
Anthony Liguori <anthony-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org> wrote: 
>
> The devices are already written to take a set_irq function.  Instead of 
> hijacking the emulated PIC device, I think it would be better if in 
> pc.c, we just conditionally created our PIC device that reflected to the 
> hypervisor and passed the appropriate function to the emulated hardware.
> 

When I first starting looking at the code, I was hoping to do exactly that.  But unfortunately it appears as though many devices call pic_set_irq() directly which is why I used the approach I did.  Take a look at qemu/hw/rt18139.c for an example. 


> Otherwise, to support all the other architectures, there's going to be a 
> lot of modifications.

Fully agree, which is why I waited for review before putting in all that work ;)

-Greg

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                 ` <8FFF7E42E93CC646B632AB40643802A8025B96AA-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2007-04-04 16:58                                                   ` Avi Kivity
@ 2007-04-04 18:12                                                   ` Anthony Liguori
  1 sibling, 0 replies; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 18:12 UTC (permalink / raw)
  To: Nakajima, Jun; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Nakajima, Jun wrote:
> Avi Kivity wrote:
>   
>> Nakajima, Jun wrote:
>>     
>>> Most of H/W-virtualization capable processors out there don't support
>>> that feature today. I think the decision (kvm or qemu) should be done
>>> based on performance data. I'm not worried about maintenance issues;
>>> the APIC code is not expected to change frequently. I'm a bit
>>> worried about extra complexity caused by such split, though.
>>>
>>>
>>>       
>> In principle we could measure the performance cost today with the
>> pv-net driver; however it still does a lot of copies which could be
>> eliminated. 
>>
>>     
>>> BTW, I see CPU utilization of qemu is almost always 99% in the top
>>> command when I run kernel build in an x86-64 Linux guest.
>>>
>>>       
>> Isn't that expected? if your guest image is mostly cached in the host,
>> the guest would have nothing to block on.
>>     
>
> I compared the performance on Xen and KVM for kernel build using the
> same guest image. Looks like KVM was (kvm-17) three times slower as far
> as we tested, and that high load of qemu was one of the symptoms. We are
> looking at the shadow code, but the load of qemu looks very high.

The KVM IDE emulation is much slower than Xen's.  Part of that is the 
new AIO framework.

It may be worth modifying the qemu_aio_init() function in block-raw.c to 
increase the AIO thread count, and trying with a SCSI disk.  The SCSI 
disk with > 1 AIO thread is pretty good.

Regards,

Anthony Liguori

>  I
> remember we had similar problems in Xen before, but those were fixed.
> Someone should take a look at the qemu side.
>
> Jun
> ---
> Intel Open Source Technology Center
>
>   


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]             ` <46139F39.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
@ 2007-04-04 18:19               ` Anthony Liguori
  0 siblings, 0 replies; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 18:19 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Gregory Haskins wrote:
>>>> On Wed, Apr 4, 2007 at 10:20 AM, in message <4613B438.60107-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>,
>>>>         
> Anthony Liguori <anthony-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org> wrote: 
>   
>> The devices are already written to take a set_irq function.  Instead of 
>> hijacking the emulated PIC device, I think it would be better if in 
>> pc.c, we just conditionally created our PIC device that reflected to the 
>> hypervisor and passed the appropriate function to the emulated hardware.
>>
>>     
>
> When I first starting looking at the code, I was hoping to do exactly that.  But unfortunately it appears as though many devices call pic_set_irq() directly which is why I used the approach I did.  Take a look at qemu/hw/rt18139.c for an example. 
>   

Yeah, I was thinking of serial.c specifically.  I just looked again and 
things are not quite as clean as I had thought.

PPC has a set_irq hook but it also has it's own pic_set_irq function.

I suspect the cleanest thing to do would be to register an irq handler 
that takes an callback/opaque.  Probably switch pic_set_irq calls to 
qemu_set_irq or something.

But this is definitely a topic to bring up on qemu-devel as it touches 
all architectures.

Regards,

Anthony Liguori

>   
>> Otherwise, to support all the other architectures, there's going to be a 
>> lot of modifications.
>>     
>
> Fully agree, which is why I waited for review before putting in all that work ;)
>
> -Greg
>
>   


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                     ` <4613D001.3040606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-04 20:01                                       ` Ingo Molnar
       [not found]                                         ` <20070404200112.GA6070-X9Un+BFzKDI@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Ingo Molnar @ 2007-04-04 20:01 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f


* Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:

> > It still exists in userspace.  Having the code duplication 
> > (especially when it's not the same code base) is unfortunate.
> 
> This remains true.

but it's the wrong argument. Of course there's duplicate functionality, 
and that's _good_ because it represents choice. KVM _itself_ is 
duplicate functionality of qemu in a way. So why move the lapic/PIC 
handling to the kernel? Because it's alot cleaner to do device emulation 
there and PV drivers get significantly easier to do. The lapic/PIC code 
should also be available in Qemu for OSs that dont have KVM-alike 
support in the kernel.

and while today most of the performance advantages of moving the PIC 
into the kernel are masked by the high cost of VM exits, in the future 
the effect will be more marked, as the relative cost of piggybacking out 
to qemu increases.

I can see the value in doing certain things in Qemu, but i cannot see 
_at all_ the value of handling say the PIT in Qemu. Just look at the 
Qemu PIT/timers code quality in Qemu for a change ... it's a huge ugly 
mess of lots of #ifdefs, ineffective handling of /dev/rtc, linear list 
walking, signal overhead, etc., etc. All of that resulting in 10-15% of 
'idle' overhead of KVM+qemu when it runs a Linux guest. On the other 
side, in the kernel it's most natural to do timers and to emulate 
hardware, because the kernel has _precise_ knowledge about the 
platform's capabilities.

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <4613D6CD.6060209-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
@ 2007-04-04 20:07                                           ` Ingo Molnar
  0 siblings, 0 replies; 81+ messages in thread
From: Ingo Molnar @ 2007-04-04 20:07 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, Avi Kivity


* Anthony Liguori <anthony-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org> wrote:

> > Keeping the apic in the kernel simplifies this with the cost of 
> > maintaining an apic/pic implementation.
> 
> Hrm, this is definitely starting to sound like a PITA to deal with.  
> Maybe in-kernel platform devices are unavoidable :-/

yes, very much so. Not only are they unavoidable, they largely simplify 
many aspects of KVM. Did anyone here ever have unreliable keyboard 
emulation in qemu because /dev/rtc didnt allow 1024 Hz? Such basic 
issues are just not present when timer emulation is done by the kernel. 
The kernel abstracts away the actual hardware, and Qemu then uses this 
abstract interfaces to create something that is specific. By moving 
platform device emulation into KVM, much of that (unnecessary) 
indirection goes away. Furthermore, scheduling and guest-interrupt 
handling is something that is most naturally done in the host kernel - 
any indirection is unnecessary fat that distracts from the core purpose 
of building a first-class hypervisor subsystem.

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <20070404200112.GA6070-X9Un+BFzKDI@public.gmane.org>
@ 2007-04-04 20:24                                           ` Anthony Liguori
       [not found]                                             ` <4614098F.2030307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2007-04-05  6:40                                           ` Avi Kivity
  1 sibling, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-04 20:24 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Ingo Molnar wrote:
> * Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
>
>   
>>> It still exists in userspace.  Having the code duplication 
>>> (especially when it's not the same code base) is unfortunate.
>>>       
>> This remains true.
>>     
>
> but it's the wrong argument. Of course there's duplicate functionality, 
> and that's _good_ because it represents choice. KVM _itself_ is 
> duplicate functionality of qemu in a way. So why move the lapic/PIC 
> handling to the kernel? Because it's alot cleaner to do device emulation 
> there and PV drivers get significantly easier to do.

But why is it a good thing to do PV drivers in the kernel?  You lose 
flexibility and functionality to gain performance.  Really, it's more 
about there not being good enough userspace interfaces to do network IO.

>  The lapic/PIC code 
> should also be available in Qemu for OSs that dont have KVM-alike 
> support in the kernel.
>
> and while today most of the performance advantages of moving the PIC 
> into the kernel are masked by the high cost of VM exits, in the future 
> the effect will be more marked, as the relative cost of piggybacking out 
> to qemu increases.
>
> I can see the value in doing certain things in Qemu, but i cannot see 
> _at all_ the value of handling say the PIT in Qemu. Just look at the 
> Qemu PIT/timers code quality in Qemu for a change ... it's a huge ugly 
> mess of lots of #ifdefs, ineffective handling of /dev/rtc, linear list 
> walking, signal overhead, etc., etc. All of that resulting in 10-15% of 
> 'idle' overhead of KVM+qemu when it runs a Linux guest. On the other 
> side, in the kernel it's most natural to do timers and to emulate 
> hardware, because the kernel has _precise_ knowledge about the 
> platform's capabilities.
>   

Yeah, I think this is a good point.  If we're going to push the APIC 
into the kernel, we might as well put the PIT there too.  The timing 
stuff is an absolute mess in QEMU since it wants to get a fast high res 
clock but isn't aware of things like CPU migration.  I'm pretty sure 
that if you are on an SMP host, some bad things can happen with the QEMU 
timer code since it relies on the rdtsc.

Regards,

Anthony Liguori

> 	Ingo
>
>   


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <4614098F.2030307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2007-04-04 21:21                                               ` Ingo Molnar
  2007-04-04 23:19                                                 ` [kvm-devel] " Rusty Russell
  2007-04-04 22:07                                               ` Dor Laor
  2007-04-05  6:54                                               ` Avi Kivity
  2 siblings, 1 reply; 81+ messages in thread
From: Ingo Molnar @ 2007-04-04 21:21 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f


* Anthony Liguori <aliguori-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:

> But why is it a good thing to do PV drivers in the kernel?  You lose 
> flexibility and functionality to gain performance. [...]

in Linux a kernel-space network driver can still be tunneled over 
user-space code, and hence you can add arbitrary add-on functionality 
(and thus have flexibility), without slowing down the common case (which 
would be to tunnel the guest's network traffic into the firewall rules 
of the kernel. No need to touch user-space for any of that).

if performance didnt matter and if it were all about flexibility then 
Bochs/Qemu could have migrated most Windows users to Linux 10 years ago. 
The reality is, performance and precision of emulation very much 
matters.

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                         ` <8FFF7E42E93CC646B632AB40643802A8025B970D-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-04 21:33                                                           ` Dor Laor
  0 siblings, 0 replies; 81+ messages in thread
From: Dor Laor @ 2007-04-04 21:33 UTC (permalink / raw)
  To: Nakajima, Jun, Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>Avi Kivity wrote:
>> Nakajima, Jun wrote:
>>> I compared the performance on Xen and KVM for kernel build using the
>>> same guest image. Looks like KVM was (kvm-17) three times slower as
>>> far as we tested, and that high load of qemu was one of the
>>> symptoms. We are looking at the shadow code, but the load of qemu
>>> looks very high. I remember we had similar problems in Xen before,
>>> but those were fixed. Someone should take a look at the qemu side.
>>>
>>
>> I'd expect the following issues to dominate:
>>
>> - the shadow cache is quite small at 256 pages.  Increasing it may
>> increase performance.
>
>Yes, we are aware of this.
>
>>
>> - we haven't yet taught the scheduler that migrating vcpus is
>> expensive due to the IPI needed to fetch the vmcs.  Maybe running
>> with 'taskset 1' would help
>>
>> - shadow eviction policy is FIFO, not LRU, which probably causes many
>> page faults.
>
>This may explain that the performance gets worse as we repeat kernel
>build, at least, the second run is slower than the first one.
>
>>
>> Running kvm_stat can help show what's going on.
>
>Thanks for the good insights. We'll come back with some analysis.

Maybe you can come up with a patch ;)
If you touch those areas anyway and run several benchmarks, writing a
small piece of code is neglectable effort.

>
>Jun
>---
>Intel Open Source Technology Center

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <4614098F.2030307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2007-04-04 21:21                                               ` Ingo Molnar
@ 2007-04-04 22:07                                               ` Dor Laor
  2007-04-05  6:54                                               ` Avi Kivity
  2 siblings, 0 replies; 81+ messages in thread
From: Dor Laor @ 2007-04-04 22:07 UTC (permalink / raw)
  To: Anthony Liguori, Ingo Molnar; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>But why is it a good thing to do PV drivers in the kernel?  You lose
>flexibility and functionality to gain performance.  Really, it's more
>about there not being good enough userspace interfaces to do network
IO.
>
>>  The lapic/PIC code
>> should also be available in Qemu for OSs that dont have KVM-alike
>> support in the kernel.
>>
>> and while today most of the performance advantages of moving the PIC
>> into the kernel are masked by the high cost of VM exits, in the
future
>> the effect will be more marked, as the relative cost of piggybacking
out
>> to qemu increases.
>>
>> I can see the value in doing certain things in Qemu, but i cannot see
>> _at all_ the value of handling say the PIT in Qemu. Just look at the
>> Qemu PIT/timers code quality in Qemu for a change ... it's a huge
ugly
>> mess of lots of #ifdefs, ineffective handling of /dev/rtc, linear
list
>> walking, signal overhead, etc., etc. All of that resulting in 10-15%
of
>> 'idle' overhead of KVM+qemu when it runs a Linux guest. On the other
>> side, in the kernel it's most natural to do timers and to emulate
>> hardware, because the kernel has _precise_ knowledge about the
>> platform's capabilities.

I think that the pit just need clean, ifdef-less implementation, I can't
believe it adds a bug performance penalty. Actually it is quite simple
to implement dyn-tick in qemu.
The question is after moving the pic,apic,ioapic into the kernel, is it
nicer to re-implement the pit in the kernel or in qemu. I suggest we
start with the first 3 nominees...

>>
>
>Yeah, I think this is a good point.  If we're going to push the APIC
>into the kernel, we might as well put the PIT there too.  The timing
>stuff is an absolute mess in QEMU since it wants to get a fast high res
>clock but isn't aware of things like CPU migration.  I'm pretty sure
>that if you are on an SMP host, some bad things can happen with the
QEMU
>timer code since it relies on the rdtsc.

Btw: KVM support guest rdtsc over physical cpus, I guess it wasn't
enough and the qemu pit side need to do it too.

>
>Regards,
>
>Anthony Liguori
>
>> 	Ingo
>>
>>
>
>
>-----------------------------------------------------------------------
--
>Take Surveys. Earn Cash. Influence the Future of IT
>Join SourceForge.net's Techsay panel and you'll get the chance to share
>your
>opinions on IT & business topics through brief surveys-and earn cash
>http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVD
EV
>_______________________________________________
>kvm-devel mailing list
>kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
>https://lists.sourceforge.net/lists/listinfo/kvm-devel

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-04 21:21                                               ` Ingo Molnar
@ 2007-04-04 23:19                                                 ` Rusty Russell
  2007-04-05  7:17                                                   ` Avi Kivity
       [not found]                                                   ` <1175728768.12230.593.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 2 replies; 81+ messages in thread
From: Rusty Russell @ 2007-04-04 23:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Anthony Liguori, kvm-devel, netdev

On Wed, 2007-04-04 at 23:21 +0200, Ingo Molnar wrote:
> * Anthony Liguori <aliguori@us.ibm.com> wrote:
> 
> > But why is it a good thing to do PV drivers in the kernel?  You lose 
> > flexibility and functionality to gain performance. [...]
> 
> in Linux a kernel-space network driver can still be tunneled over 
> user-space code, and hence you can add arbitrary add-on functionality 
> (and thus have flexibility), without slowing down the common case (which 
> would be to tunnel the guest's network traffic into the firewall rules 
> of the kernel. No need to touch user-space for any of that).

You didn't quote Anthony's point about "it's more about there not being
good enough userspace interfaces to do network IO."

It's easier to write a kernel-space network driver, but it's not
obviously the right thing to do until we can show that an efficient
packet-level userspace interface isn't possible.  I don't think that's
been done, and it would be interesting to try.

Cheers,
Rusty.




^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <20070404200112.GA6070-X9Un+BFzKDI@public.gmane.org>
  2007-04-04 20:24                                           ` Anthony Liguori
@ 2007-04-05  6:40                                           ` Avi Kivity
  1 sibling, 0 replies; 81+ messages in thread
From: Avi Kivity @ 2007-04-05  6:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Ingo Molnar wrote:
> * Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
>
>   
>>> It still exists in userspace.  Having the code duplication 
>>> (especially when it's not the same code base) is unfortunate.
>>>       
>> This remains true.
>>     
>
> but it's the wrong argument. Of course there's duplicate functionality, 
> and that's _good_ because it represents choice. KVM _itself_ is 
> duplicate functionality of qemu in a way. So why move the lapic/PIC 
> handling to the kernel? Because it's alot cleaner to do device emulation 
> there and PV drivers get significantly easier to do. The lapic/PIC code 
> should also be available in Qemu for OSs that dont have KVM-alike 
> support in the kernel.
>   

Duplicating code is never good, and duplicating code into the kernel 
(where maintenance cost is much higher) is bad.  There has to be a very 
good reason for it.  For the core kvm code, it is the 10x or more 
performance increase over qemu.  If we are to add *pic/pit to kvm, we 
need to find an advantage that offsets the disadvantages.  This can be a 
combination of simpler interfaces and better performance.

> and while today most of the performance advantages of moving the PIC 
> into the kernel are masked by the high cost of VM exits, in the future 
> the effect will be more marked, as the relative cost of piggybacking out 
> to qemu increases.
>   

That is correct, but we need to quantify it.  Assuming 3us per qemu exit 
overhead, 30,000 events per second per core give us a 10% overall 
overhead.  That's >100K events/sec for an entry-level server.

> I can see the value in doing certain things in Qemu, but i cannot see 
> _at all_ the value of handling say the PIT in Qemu. Just look at the 
> Qemu PIT/timers code quality in Qemu for a change ... it's a huge ugly 
> mess of lots of #ifdefs, ineffective handling of /dev/rtc, linear list 
> walking, signal overhead, etc., etc. 

Bad userspace code should be fixed, not rewritten as kernel code.

> All of that resulting in 10-15% of 
> 'idle' overhead of KVM+qemu when it runs a Linux guest. 

On my machines it's 0% overhead on idle (runlevel 3).


> On the other 
> side, in the kernel it's most natural to do timers and to emulate 
> hardware, because the kernel has _precise_ knowledge about the 
> platform's capabilities.
>   

That comes back to the kernel not exporting proper interfaces.  I think 
that's fixed now, with hrtimers tied to the userspace APIs?

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <4614098F.2030307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2007-04-04 21:21                                               ` Ingo Molnar
  2007-04-04 22:07                                               ` Dor Laor
@ 2007-04-05  6:54                                               ` Avi Kivity
  2 siblings, 0 replies; 81+ messages in thread
From: Avi Kivity @ 2007-04-05  6:54 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>
> Yeah, I think this is a good point.  If we're going to push the APIC 
> into the kernel, we might as well put the PIT there too.  The timing 
> stuff is an absolute mess in QEMU since it wants to get a fast high 
> res clock but isn't aware of things like CPU migration.

qemu timing could indeed use an overhaul, for example not relying on a 
perioding tick but instead calculating the next wakeup event (for which 
it has all the infrastructure already; just unused).


> I'm pretty sure that if you are on an SMP host, some bad things can 
> happen with the QEMU timer code since it relies on the rdtsc.

It seems to compensate for it.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-04 23:19                                                 ` [kvm-devel] " Rusty Russell
@ 2007-04-05  7:17                                                   ` Avi Kivity
  2007-04-06  1:02                                                     ` Rusty Russell
       [not found]                                                   ` <1175728768.12230.593.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-05  7:17 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, netdev

Rusty Russell wrote:
> You didn't quote Anthony's point about "it's more about there not being
> good enough userspace interfaces to do network IO."
>
> It's easier to write a kernel-space network driver, but it's not
> obviously the right thing to do until we can show that an efficient
> packet-level userspace interface isn't possible.  I don't think that's
> been done, and it would be interesting to try.
>   

In the case of networking, the copyful interfaces on receive are driven 
by the hardware not knowing how to split the header from the data.  On 
transmit I agree, it could be made copyless from userspace (somthing 
like sendfilev, only not file oriented).

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                   ` <1175728768.12230.593.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-04-05  9:30                                                     ` Ingo Molnar
       [not found]                                                       ` <20070405093033.GC25448-X9Un+BFzKDI@public.gmane.org>
  2007-04-05 14:32                                                       ` [kvm-devel] " Anthony Liguori
  0 siblings, 2 replies; 81+ messages in thread
From: Ingo Molnar @ 2007-04-05  9:30 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev


* Rusty Russell <rusty-8n+1lVoiYb80n/F98K4Iww@public.gmane.org> wrote:

> It's easier to write a kernel-space network driver, but it's not 
> obviously the right thing to do until we can show that an efficient 
> packet-level userspace interface isn't possible.  I don't think that's 
> been done, and it would be interesting to try.

yes, i agree in theory, but IMO this is largely beside the point. What 
matters most for developing a project is _the quality of the codebase_. 
That attracts developers, developers improve the code, which then 
attracts users, which attracts more developers, etc., etc. As long as 
the quality of the codebase is maintained, this is a self-sustaining 
process. You've seen that happen with Linux. [ And of course, the 
crutial step #0 is: a sane, open-minded maintainer with good taste ;-) ]

qemu's code quality is not really suitable for that basic OSS model, in 
my opinion. It has been a mostly one-man show for a long time with 
various hostile forks, bin-only kernel module and other actions that 
easily poison an OSS project.

the result is not surprising: important portions of qemu have grown into 
a hard to hack, hard to maintain codebase with poor code quality, with 
gems like:

 #ifdef _WIN32
 void CALLBACK host_alarm_handler(UINT uTimerID, UINT uMsg,
                                  DWORD_PTR dwUser, DWORD_PTR dw1, DWORD_PTR dw2)
 #else
 static void host_alarm_handler(int host_signum)
 #endif
 {
 #if 0
 #define DISP_FREQ 1000

and that's not just some random driver - this is _the_ main central 
timer code of qemu.

so right now the only option for a clean codebase is the KVM in-kernel 
code. It's clean and sweet and integrates nicely into the rest of the 
kernel. The kernel is also obviously the final place where most 
virtualization technologies want to show up because it's the entity that 
is the closest to the guest context: we _dont_ want to _force_ network 
traffic (let alone interrupt handling) through a userspace context, only 
if the functionality of the task absolutely requires it. (but in most 
cases we'll try to come up with a maximally flexible scheme that can 
just drive things straight via the kernel. netfilter/iptables isnt in 
user-space either, partly for that reason.)

but architectural issues aside (ignoring that the kernel _is_ the best 
place to do this particular of stuff), this question is still mainly 
dominated by the basic question of code quality. I'd rather move 
something into the Linux kernel, enforce its code quality that way, and 
_then_ add whatever clean infrastructure is needed to push it back into 
user-space again (into a different codebase), than having to hack the 
monolithic 200 KLOC+ qemu codebase that is shackled with support for 
tons of arcane architectures nobody uses and tons of arcane OS variants 
that no-one cares about. Now qemu is a very important enabler and 
platform-reference-implementation for KVM to fall back to, but it's not 
the place to put crutial new code into, at least currently.

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                       ` <20070405093033.GC25448-X9Un+BFzKDI@public.gmane.org>
@ 2007-04-05  9:58                                                         ` Avi Kivity
  2007-04-05 10:26                                                           ` [kvm-devel] " Ingo Molnar
  2007-04-05 10:55                                                         ` Ingo Molnar
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-05  9:58 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Ingo Molnar wrote:
> so right now the only option for a clean codebase is the KVM in-kernel 
> code.

I strongly disagree with this.  Bad code in userspace is not an excuse 
for shoving stuff into the kernel, where maintaining it is much more 
expensive, and the cause of a mistake can be system crashes and data 
loss, affecting unrelated processes.  If we move something into the 
kernel, we'd better have a really good reason for it.

Qemu code _is_ crufty.  We can do one of three things:
1. live with it
2. fork it and clean it up
3. clean it up incrementally and merge it upstream

Currently we're doing (1).  You're suggesting a variant of (2), fork 
plus move into the kernel.  The right thing to do IMO is (3), but I 
don't see anybody volunteering.  Qemu picked up additional committers 
recently and I believe they would be receptive to cleanups.

[In the *pic/pit case, we have other reasons to push things into the 
kernel.  But "this code is crap, let's rewrite it in the kernel" is not 
a justification I'll accept.  I'd be much happier if we could quantify 
these other reasons.]


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-05  9:58                                                         ` Avi Kivity
@ 2007-04-05 10:26                                                           ` Ingo Molnar
  2007-04-05 11:26                                                             ` Avi Kivity
  0 siblings, 1 reply; 81+ messages in thread
From: Ingo Molnar @ 2007-04-05 10:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Rusty Russell, kvm-devel, netdev


* Avi Kivity <avi@qumranet.com> wrote:

> > so right now the only option for a clean codebase is the KVM 
> > in-kernel code.
> 
> I strongly disagree with this.

are you disagreeing with my statement that the KVM kernel-side code is 
the only clean codebase here? To me this is a clear fact :)

I only pointed out that the only clean codebase at the moment is the KVM 
in-kernel code - i did not make the argument (at all) that every new 
piece of KVM code should be done in the kernel. That would be stupid - 
do you think i'd advocate for example moving command line argument 
parsing into the kernel?

and as i said in the mail: "the kernel _is_ the best place to do this 
particular stuff".

	Ingo

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                       ` <20070405093033.GC25448-X9Un+BFzKDI@public.gmane.org>
  2007-04-05  9:58                                                         ` Avi Kivity
@ 2007-04-05 10:55                                                         ` Ingo Molnar
  1 sibling, 0 replies; 81+ messages in thread
From: Ingo Molnar @ 2007-04-05 10:55 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev


* Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org> wrote:

> * Rusty Russell <rusty-8n+1lVoiYb80n/F98K4Iww@public.gmane.org> wrote:
> 
> > It's easier to write a kernel-space network driver, but it's not 
> > obviously the right thing to do until we can show that an efficient 
> > packet-level userspace interface isn't possible.  I don't think 
> > that's been done, and it would be interesting to try.
> 
> yes, i agree in theory, [...]

let me explain my position a bit more verbosely:

i agree in terms of 'network driver' (and more generally in terms of 
'device', which includes network, storage, console, etc. devices): 
having a user-space driver option should still be possible and it should 
be integrated well. Qemu is quite rich and flexible in these areas and 
we dont want to throw away or isolate that body of code.

but i dont agree in terms of PIC code, which is the main argument in 
this particular thread. There's little precedent for any add-ons for 
PICs in user-space, nor any particular PIC handling richness in Qemu 
that we'd like to preserve.

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-05 10:26                                                           ` [kvm-devel] " Ingo Molnar
@ 2007-04-05 11:26                                                             ` Avi Kivity
       [not found]                                                               ` <4614DCE1.70905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-05 11:26 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Rusty Russell, kvm-devel, netdev

Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
>
>   
>>> so right now the only option for a clean codebase is the KVM 
>>> in-kernel code.
>>>       
>> I strongly disagree with this.
>>     
>
> are you disagreeing with my statement that the KVM kernel-side code is 
> the only clean codebase here? To me this is a clear fact :)
>   

No, I agree with that.  I just disagree with choosing to put the *pic 
code (or other code) into the kernel on *that* basis.  The selection 
should be on design/performance issues alone, *not* the state of 
existing code.

> I only pointed out that the only clean codebase at the moment is the KVM 
> in-kernel code - i did not make the argument (at all) that every new 
> piece of KVM code should be done in the kernel. That would be stupid - 
> do you think i'd advocate for example moving command line argument 
> parsing into the kernel?
>   

No.  But the difference in cruftiness between kvm and qemu code should 
not enter into the discussion of where to do things.

> and as i said in the mail: "the kernel _is_ the best place to do this 
> particular stuff".
>   

I agree with this, maybe for different reasons.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                               ` <4614DCE1.70905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-05 11:36                                                                 ` Ingo Molnar
  2007-04-06  1:16                                                                   ` [kvm-devel] " Rusty Russell
  0 siblings, 1 reply; 81+ messages in thread
From: Ingo Molnar @ 2007-04-05 11:36 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev


* Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:

> [...] But the difference in cruftiness between kvm and qemu code 
> should not enter into the discussion of where to do things.

i agree that it doesnt enter the discussion for the *PIC question, but 
it very much enters the discussion for the question that i replied to:

> > > You didn't quote Anthony's point about "it's more about there not 
> > > being good enough userspace interfaces to do network IO."
> > > 
> > > It's easier to write a kernel-space network driver, but it's not
> > >  obviously the right thing to do until we can show that an 
> > > efficient packet-level userspace interface isn't possible.  I 
> > > don't think that's been done, and it would be interesting to try.

prototyping new kernel APIs to implement user-space network drivers, on 
a crufty codebase is not something that should be done lightly. Any 
negative result will not bring us any real conclusion. (was the failure 
due to the concept, due the API or due to the crufty codebase?)

(but ... this is really a side-track issue for the *PIC question at 
hand. PICs are not network devices, they are essential platform 
components and almost an extended part of the CPU.)

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                     ` <64F9B87B6B770947A9F8391472E032160B318ED4-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
  2007-04-04 16:48                                       ` Anthony Liguori
@ 2007-04-05 13:50                                       ` Dong, Eddie
       [not found]                                         ` <0E6FE5D295DE5B4B8D9070C26A227987E0D1EC-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Dong, Eddie @ 2007-04-05 13:50 UTC (permalink / raw)
  To: Dor Laor, Anthony Liguori, Avi Kivity
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dor Laor wrote:
> 
> Do you know why Xen guy choose of implementing it in Xen? Why didn't
> they rip Qemu implementation?
> 
With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux 
kernel, KB gets 14% gain. We also did a shared PIC model which share
 PIC state among Qemu & VMM with less LOC in VMM, it can get 
similar performance gain (5.8% in my test).
BTW, at that time, PIT is in VMM already.
Eddie

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <0E6FE5D295DE5B4B8D9070C26A227987E0D1EC-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-05 13:59                                           ` Avi Kivity
       [not found]                                             ` <461500B5.4080102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-05 14:41                                           ` Dor Laor
  1 sibling, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-05 13:59 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dong, Eddie wrote:
> Dor Laor wrote:
>   
>> Do you know why Xen guy choose of implementing it in Xen? Why didn't
>> they rip Qemu implementation?
>>
>>     
> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux 
> kernel, KB gets 14% gain. We also did a shared PIC model which share
>  PIC state among Qemu & VMM with less LOC in VMM, it can get 
> similar performance gain (5.8% in my test).
> BTW, at that time, PIT is in VMM already.
>   

I expect that the gain in kvm will be smaller.  Xen has to schedule dom0 
to process the event channel (possibly on another cpu), dom0 has to 
schedule qemu-dm (again, possibly on another cpu), qemu does its thing, 
and then Xen has to schedule domU again.  With kvm, we are always on the 
same cpu, and the only overhead is the system call, which is a few 
hundred nanoseconds.  I expect with current hardware that it will be 
negligible (as a vmexit is measured in microseconds), but to become 
measurable as hardware improves.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-05  9:30                                                     ` Ingo Molnar
       [not found]                                                       ` <20070405093033.GC25448-X9Un+BFzKDI@public.gmane.org>
@ 2007-04-05 14:32                                                       ` Anthony Liguori
  2007-04-06 10:37                                                         ` Ingo Molnar
  1 sibling, 1 reply; 81+ messages in thread
From: Anthony Liguori @ 2007-04-05 14:32 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Rusty Russell, kvm-devel, netdev

Ingo Molnar wrote:
> * Rusty Russell <rusty@rustcorp.com.au> wrote:
>
>   
>> It's easier to write a kernel-space network driver, but it's not 
>> obviously the right thing to do until we can show that an efficient 
>> packet-level userspace interface isn't possible.  I don't think that's 
>> been done, and it would be interesting to try.
>>     
>
> yes, i agree in theory, but IMO this is largely beside the point. What 
> matters most for developing a project is _the quality of the codebase_. 
> That attracts developers, developers improve the code, which then 
> attracts users, which attracts more developers, etc., etc. As long as 
> the quality of the codebase is maintained, this is a self-sustaining 
> process. You've seen that happen with Linux. [ And of course, the 
> crutial step #0 is: a sane, open-minded maintainer with good taste ;-) ]
>
> qemu's code quality is not really suitable for that basic OSS model, in 
> my opinion.

I think you may want to step off your high horse there.  QEMU's code may 
not be Linux kernel quality but it's certainly not anywhere near the 
worst that is out there.  Linux is over decade old.  QEMU is only around 
3 years old.  Did Linux have extremely high quality code in 1994?  
Instead of posting code snippets to LKML, it would be much more 
constructive to post patches to qemu-devel.  It's not like the QEMU 
maintainers are actively ignoring your efforts to improve the code.

> but architectural issues aside (ignoring that the kernel _is_ the best 
> place to do this particular of stuff),

Right.  We don't put things in the kernel just because we don't like the 
way the userspace code is written.  If that logic was valid, then Linus 
would be working on moving all of Gnome into the kernel.

This discussion has two parts.  The first is whether or not the kernel 
is the right place for a paravirtual network driver backend.  My current 
believe is that we could not get enough performance from something like 
tun to do it in userspace.  I also believe that we could improve tun (or 
create a replacement) so that we could implement a PV network driver 
backend in userspace.  Admittedly, I'm not an expert in networking 
though so I could be wrong here.

The second part is whether the platform devices should go in the 
kernel.  I agree with you that having the PIT in the kernel is probably 
a good idea.  I also agree that we probably have no choice but to move 
the APIC into the kernel (not for PV drivers, but for TPR performance 
and SMP support).

Regards,

Anthony Liguori

>  this question is still mainly 
> dominated by the basic question of code quality. I'd rather move 
> something into the Linux kernel, enforce its code quality that way, and 
> _then_ add whatever clean infrastructure is needed to push it back into 
> user-space again (into a different codebase), than having to hack the 
> monolithic 200 KLOC+ qemu codebase that is shackled with support for 
> tons of arcane architectures nobody uses and tons of arcane OS variants 
> that no-one cares about. Now qemu is a very important enabler and 
> platform-reference-implementation for KVM to fall back to, but it's not 
> the place to put crutial new code into, at least currently.
>
> 	Ingo
>   


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                         ` <0E6FE5D295DE5B4B8D9070C26A227987E0D1EC-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2007-04-05 13:59                                           ` Avi Kivity
@ 2007-04-05 14:41                                           ` Dor Laor
       [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B319509-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Dor Laor @ 2007-04-05 14:41 UTC (permalink / raw)
  To: Dong, Eddie, Anthony Liguori, Avi Kivity
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f


>Dor Laor wrote:
>>
>> Do you know why Xen guy choose of implementing it in Xen? Why didn't
>> they rip Qemu implementation?
>>
>With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux
>kernel, KB gets 14% gain. We also did a shared PIC model which share
> PIC state among Qemu & VMM with less LOC in VMM, it can get
>similar performance gain (5.8% in my test).
>BTW, at that time, PIT is in VMM already.
>Eddie

Thanks for the info, I suspected most chances the performance gain is
small.
So for the sake of the next arguments, what was the fuel running the
decision of moving all the X-PI[c|t] things into the Xen?

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B319509-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
@ 2007-04-05 14:48                                               ` Dong, Eddie
  2007-04-05 14:57                                               ` Dong, Eddie
  1 sibling, 0 replies; 81+ messages in thread
From: Dong, Eddie @ 2007-04-05 14:48 UTC (permalink / raw)
  To: Dor Laor, Anthony Liguori, Avi Kivity
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dor Laor wrote:
>> Dor Laor wrote:
>>> 
>>> Do you know why Xen guy choose of implementing it in Xen? Why
>>> didn't they rip Qemu implementation? 
>>> 
>> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux
>> kernel, KB gets 14% gain. We also did a shared PIC model which share
>> PIC state among Qemu & VMM with less LOC in VMM, it can get
>> similar performance gain (5.8% in my test).
>> BTW, at that time, PIT is in VMM already.
>> Eddie
> 
> Thanks for the info, I suspected most chances the performance gain is
> small.
> So for the sake of the next arguments, what was the fuel running the
> decision of moving all the X-PI[c|t] things into the Xen?

Performance.
Thx, Eddie

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <461500B5.4080102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-05 14:52                                               ` Dong, Eddie
       [not found]                                                 ` <0E6FE5D295DE5B4B8D9070C26A227987F2444B-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Dong, Eddie @ 2007-04-05 14:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
>> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux
>> kernel, KB gets 14% gain. We also did a shared PIC model which share
>>  PIC state among Qemu & VMM with less LOC in VMM, it can get
>> similar performance gain (5.8% in my test).
>> BTW, at that time, PIT is in VMM already.
>> 
> 
> I expect that the gain in kvm will be smaller.  Xen has to schedule
> dom0 to process the event channel (possibly on another cpu), dom0 has
> to schedule qemu-dm (again, possibly on another cpu), qemu does its
> thing, and then Xen has to schedule domU again.  With kvm, we are
> always on the same cpu, and the only overhead is the system call,
> which is a few hundred nanoseconds.  I expect with current hardware
> that it will be negligible (as a vmexit is measured in microseconds),
> but to become measurable as hardware improves.
Yes very possible.
We can take a quick mesurement to see how many cycles are spent in a
dummy
I/O emulation in KVM/Qemu. In Xen, one of my old P4 3.8GHZ platform
takes 
about 50-60K cycles. We can see how many is it in KVM.
BTW, today Linux kernel is no longer 1000HZ :-)
thx,eddie

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B319509-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
  2007-04-05 14:48                                               ` Dong, Eddie
@ 2007-04-05 14:57                                               ` Dong, Eddie
  1 sibling, 0 replies; 81+ messages in thread
From: Dong, Eddie @ 2007-04-05 14:57 UTC (permalink / raw)
  To: Dor Laor, Anthony Liguori, Avi Kivity
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dor Laor wrote:
> So for the sake of the next arguments, what was the fuel running the
> decision of moving all the X-PI[c|t] things into the Xen?
PIT is in Xen at very beginning when HVM support is designed due to
performance 
reason. Moving PIC to VMM is also for performance reason at that time 
(SMP is not there yet), also potential for SMP too.
APIC was enabled 2 months later after PIC and it was pretty nature to be
in 
VMM since PIC was already in VMM at that time :-)
Thanks, eddie

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                 ` <0E6FE5D295DE5B4B8D9070C26A227987F2444B-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-05 15:01                                                   ` Avi Kivity
       [not found]                                                     ` <46150F4F.4030505-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-05 15:01 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dong, Eddie wrote:
> Avi Kivity wrote:
>   
>>> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux
>>> kernel, KB gets 14% gain. We also did a shared PIC model which share
>>>  PIC state among Qemu & VMM with less LOC in VMM, it can get
>>> similar performance gain (5.8% in my test).
>>> BTW, at that time, PIT is in VMM already.
>>>
>>>       
>> I expect that the gain in kvm will be smaller.  Xen has to schedule
>> dom0 to process the event channel (possibly on another cpu), dom0 has
>> to schedule qemu-dm (again, possibly on another cpu), qemu does its
>> thing, and then Xen has to schedule domU again.  With kvm, we are
>> always on the same cpu, and the only overhead is the system call,
>> which is a few hundred nanoseconds.  I expect with current hardware
>> that it will be negligible (as a vmexit is measured in microseconds),
>> but to become measurable as hardware improves.
>>     
> Yes very possible.
> We can take a quick mesurement to see how many cycles are spent in a
> dummy
> I/O emulation in KVM/Qemu. In Xen, one of my old P4 3.8GHZ platform
> takes 
> about 50-60K cycles. We can see how many is it in KVM.
> BTW, today Linux kernel is no longer 1000HZ :-)
> thx,eddie
>   

There's some (old) data here:

  http://virt.kernelnewbies.org/KVM/Performance

showing pio latency of ~5600 cycles.  Note that this is on AMD, which 
takes less cycles to switch than the P4, but on the other hand, we still 
do a save/restore of the fpu state on every exit, so we can speed it up 
even more.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                     ` <46150F4F.4030505-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-05 15:16                                                       ` Dong, Eddie
       [not found]                                                         ` <0E6FE5D295DE5B4B8D9070C26A227987F24452-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2007-04-05 15:22                                                       ` Anthony Liguori
  1 sibling, 1 reply; 81+ messages in thread
From: Dong, Eddie @ 2007-04-05 15:16 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Dong, Eddie wrote:
>> Avi Kivity wrote:
>> 
>>>> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ
>>>> linux kernel, KB gets 14% gain. We also did a shared PIC model
>>>>  which share PIC state among Qemu & VMM with less LOC in VMM, it
>>>> can get 
>>>> similar performance gain (5.8% in my test).
>>>> BTW, at that time, PIT is in VMM already.
>>>> 
>>>> 
>>> I expect that the gain in kvm will be smaller.  Xen has to schedule
>>> dom0 to process the event channel (possibly on another cpu), dom0
>>> has to schedule qemu-dm (again, possibly on another cpu), qemu does
>>> its thing, and then Xen has to schedule domU again.  With kvm, we
>>> are always on the same cpu, and the only overhead is the system
>>> call, which is a few hundred nanoseconds.  I expect with current
>>> hardware that it will be negligible (as a vmexit is measured in
>>> microseconds), but to become measurable as hardware improves.
>>> 
>> Yes very possible.
>> We can take a quick mesurement to see how many cycles are spent in a
>> dummy I/O emulation in KVM/Qemu. In Xen, one of my old P4 3.8GHZ
>> platform takes about 50-60K cycles. We can see how many is it in KVM.
>> BTW, today Linux kernel is no longer 1000HZ :-)
>> thx,eddie
>> 
> 
> There's some (old) data here:
> 
>   http://virt.kernelnewbies.org/KVM/Performance
> 
> showing pio latency of ~5600 cycles.  Note that this is on AMD, which
> takes less cycles to switch than the P4, but on the other hand, we
> still do a save/restore of the fpu state on every exit, so we can
> speed it up even more.

That is great I/O performance difference between Xen & KVM though the
sample data 
is too small (100 delta's were used). We can try with more samples to
let it last for several 
minutes to include many scheduler events that we used before.
On the otherhand, it proves that context switch between KVM and Qemu is
quit lightweight
given that there is no domain switch and even no guest task switch in
most case.
Thanks, eddie

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                     ` <46150F4F.4030505-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-05 15:16                                                       ` Dong, Eddie
@ 2007-04-05 15:22                                                       ` Anthony Liguori
  1 sibling, 0 replies; 81+ messages in thread
From: Anthony Liguori @ 2007-04-05 15:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Dong, Eddie wrote:
>> Avi Kivity wrote:
>>  
>>>> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ linux
>>>> kernel, KB gets 14% gain. We also did a shared PIC model which share
>>>>  PIC state among Qemu & VMM with less LOC in VMM, it can get
>>>> similar performance gain (5.8% in my test).
>>>> BTW, at that time, PIT is in VMM already.
>>>>
>>>>       
>>> I expect that the gain in kvm will be smaller.  Xen has to schedule
>>> dom0 to process the event channel (possibly on another cpu), dom0 has
>>> to schedule qemu-dm (again, possibly on another cpu), qemu does its
>>> thing, and then Xen has to schedule domU again.  With kvm, we are
>>> always on the same cpu, and the only overhead is the system call,
>>> which is a few hundred nanoseconds.  I expect with current hardware
>>> that it will be negligible (as a vmexit is measured in microseconds),
>>> but to become measurable as hardware improves.
>>>     
>> Yes very possible.
>> We can take a quick mesurement to see how many cycles are spent in a
>> dummy
>> I/O emulation in KVM/Qemu. In Xen, one of my old P4 3.8GHZ platform
>> takes about 50-60K cycles. We can see how many is it in KVM.
>> BTW, today Linux kernel is no longer 1000HZ :-)
>> thx,eddie
>>   
>
> There's some (old) data here:
>
>  http://virt.kernelnewbies.org/KVM/Performance
>
> showing pio latency of ~5600 cycles.  Note that this is on AMD, which 
> takes less cycles to switch than the P4, but on the other hand, we 
> still do a save/restore of the fpu state on every exit, so we can 
> speed it up even more.

It varies quite a lot.  On more modern Intel processors, the PIO latency 
is pretty good.  Also, the number on the wiki is worse than what there 
is today as back then there was an extra syscall pair in the PIO path.

I posted a timeline a while ago on kvm-devel.  IIRC, going to userspace 
added < 1k cycles.  However, this will get much more pronounced as we 
improve the VMEXIT latency in KVM.

Right now we save a lot of state that we strictly don't have to.  Avi 
mentioned the FPU state but we also save many MSRs on every exit that we 
strictly only have to save/restore when we lose the CPU (such as the 
sysenter MSRs).  The saving and restoring of these MSRs make up the bulk 
of the (avoidable) VMEXIT overhead.  Eliminating these will actually 
make the difference between handling in-kernel and userspace much more 
pronounced.

Xen PIO latency is definitely much worse (at least a factor of 3).  
However, what really kills Xen is that occasionally an undesirable 
scheduling decision is made and the PIO latency jumps to an enormous 
value.  This ends making the mean PIO latency considerably worse than 
the median latency.

Regards,

Anthony Liguori

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                         ` <0E6FE5D295DE5B4B8D9070C26A227987F24452-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2007-04-05 15:26                                                           ` Anthony Liguori
  0 siblings, 0 replies; 81+ messages in thread
From: Anthony Liguori @ 2007-04-05 15:26 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dong, Eddie wrote:
> Avi Kivity wrote:
>   
>> Dong, Eddie wrote:
>>     
>>> Avi Kivity wrote:
>>>
>>>       
>>>>> With PIC in Xen, CPU2K gets 6.5% performance gain in old 1000HZ
>>>>> linux kernel, KB gets 14% gain. We also did a shared PIC model
>>>>>  which share PIC state among Qemu & VMM with less LOC in VMM, it
>>>>> can get 
>>>>> similar performance gain (5.8% in my test).
>>>>> BTW, at that time, PIT is in VMM already.
>>>>>
>>>>>
>>>>>           
>>>> I expect that the gain in kvm will be smaller.  Xen has to schedule
>>>> dom0 to process the event channel (possibly on another cpu), dom0
>>>> has to schedule qemu-dm (again, possibly on another cpu), qemu does
>>>> its thing, and then Xen has to schedule domU again.  With kvm, we
>>>> are always on the same cpu, and the only overhead is the system
>>>> call, which is a few hundred nanoseconds.  I expect with current
>>>> hardware that it will be negligible (as a vmexit is measured in
>>>> microseconds), but to become measurable as hardware improves.
>>>>
>>>>         
>>> Yes very possible.
>>> We can take a quick mesurement to see how many cycles are spent in a
>>> dummy I/O emulation in KVM/Qemu. In Xen, one of my old P4 3.8GHZ
>>> platform takes about 50-60K cycles. We can see how many is it in KVM.
>>> BTW, today Linux kernel is no longer 1000HZ :-)
>>> thx,eddie
>>>
>>>       
>> There's some (old) data here:
>>
>>   http://virt.kernelnewbies.org/KVM/Performance
>>
>> showing pio latency of ~5600 cycles.  Note that this is on AMD, which
>> takes less cycles to switch than the P4, but on the other hand, we
>> still do a save/restore of the fpu state on every exit, so we can
>> speed it up even more.
>>     
>
> That is great I/O performance difference between Xen & KVM though the
> sample data 
> is too small (100 delta's were used). We can try with more samples to
> let it last for several 
> minutes to include many scheduler events that we used before.
> On the otherhand, it proves that context switch between KVM and Qemu is
> quit lightweight
> given that there is no domain switch and even no guest task switch in
> most case.
>   

You may want to take a look at virtbench.  It has quite a good number of 
microbenchmarks specifically designed for virtualized environments.

http://ozlabs.org/~rusty/virtbench

I use "local" for measuring KVM/Xen.

Regards,

Anthony Liguori

> Thanks, eddie
>
>   


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-05  7:17                                                   ` Avi Kivity
@ 2007-04-06  1:02                                                     ` Rusty Russell
  2007-04-08  5:36                                                       ` Avi Kivity
  0 siblings, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-06  1:02 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm-devel, netdev

On Thu, 2007-04-05 at 10:17 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > You didn't quote Anthony's point about "it's more about there not being
> > good enough userspace interfaces to do network IO."
> >
> > It's easier to write a kernel-space network driver, but it's not
> > obviously the right thing to do until we can show that an efficient
> > packet-level userspace interface isn't possible.  I don't think that's
> > been done, and it would be interesting to try.
> >   
> 
> In the case of networking, the copyful interfaces on receive are driven 
> by the hardware not knowing how to split the header from the data.  On 
> transmit I agree, it could be made copyless from userspace (somthing 
> like sendfilev, only not file oriented).

Hi Avi,

	I don't think you've thought about this very hard.  The receive copy is
completely independent with whether the packet is going to the guest via
a kernel driver or via userspace, so not relevant.

	And if all packets from the card are going to the guest, you can
deliver directly.  Userspace or kernel, no difference.

	And we have a "sendfilev not file oriented": it's called "writev" 8)

	An in-kernel driver can avoid system call overhead and page references.
But a better tap device helps more than just KVM.

Rusty.



^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-05 11:36                                                                 ` Ingo Molnar
@ 2007-04-06  1:16                                                                   ` Rusty Russell
  2007-04-06 18:59                                                                     ` Ingo Molnar
  0 siblings, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-06  1:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Avi Kivity, kvm-devel, netdev

On Thu, 2007-04-05 at 13:36 +0200, Ingo Molnar wrote:
> prototyping new kernel APIs to implement user-space network drivers, on 
> a crufty codebase is not something that should be done lightly.

I think you overestimate my radicalism.  I was considering readv() and
writev() on the tap device.

Qemu's infrastructure may hurt kvm here, but lguest won't be able to use
that excuse.

> track issue for the *PIC question at 
> hand. PICs are not network devices, they are essential platform 
> components and almost an extended part of the CPU.)

Definitely, I'm only interested in stealing^H^H^Hsharing KVM devices.
The subject is now deeply misleading 8(

Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-05 14:32                                                       ` [kvm-devel] " Anthony Liguori
@ 2007-04-06 10:37                                                         ` Ingo Molnar
  2007-04-06 11:07                                                           ` Ingo Molnar
  0 siblings, 1 reply; 81+ messages in thread
From: Ingo Molnar @ 2007-04-06 10:37 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Rusty Russell, kvm-devel, netdev


* Anthony Liguori <aliguori@us.ibm.com> wrote:

> [...] Did Linux have extremely high quality code in 1994?

yes! It was crutial to strive for extremely high quality code all the 
time. That was the only way to grow Linux's codebase, which was ~300,000 
lines of code in 1994, to the current 7.2+ million lines of code, 
without losing maintainability. Code quality is more important than any 
feature. 99% of feature patches sent to lkml get rejected in the first 
review round on quality/design grounds, it always takes at least a 
couple of iterations to make it nice and clean. Look at Apache, it's 
evolving along the same lines. Or Samba. Or any of the really large and 
important OSS projects. (even X, after years of struggle and stagnation, 
seems to have gotten this point now.) In the past 10 years the OSS 
community wrote more than 1 billion lines of new code (!), and all the 
successful projects have a clean codebase. _It cannot be done any other 
way_, because cleanliness and pride over good code is what keeps 
developers and it is what attracts new developers.

now this doesnt mean that Linux's code quality is good in every spot - 
it's an eternal fight. But the core subsystems are pretty damn clean. 
When i prepare patches for the Linux kernel more than 50% of the work i 
do is related to making the changes clean, or cleaning up some existing 
aspect of the kernel that the new code triggers. Often it's 90% of the 
work!

the 'get functionality now, clean up later' mentality is what leads to 
throwaway, use-once codebases that the majority of closed-source 
projects do. Once the cruft level reaches a certain threshold it's 
cheaper to just throw away old code and just rewrite the whole thing 
(users and costs be damned). Cleanups must not be an afterthought, code 
cleanliness and gradual code evolution is _the_ most valuable property 
of OSS codebases.

i guess my negative Qemu experience is dominated by my recent failure of 
trying to untangle its timer code, so that qemu properly adopts to 
changes in PIT/lapic programming and maps that correctly to OS timers. 
(so that a dynticks/NO_HZ guest's reduced irq rate becomes visible on 
the host too) I'll be a happy camper if that's fixed ;-)

	Ingo

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-06 10:37                                                         ` Ingo Molnar
@ 2007-04-06 11:07                                                           ` Ingo Molnar
  0 siblings, 0 replies; 81+ messages in thread
From: Ingo Molnar @ 2007-04-06 11:07 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Rusty Russell, kvm-devel, netdev


* Ingo Molnar <mingo@elte.hu> wrote:

> * Anthony Liguori <aliguori@us.ibm.com> wrote:
> 
> > [...] Did Linux have extremely high quality code in 1994?
> 
> yes! It was crutial to strive for extremely high quality code all the 
> time. That was the only way to grow Linux's codebase, which was 
> ~300,000 lines of code in 1994, to the current 7.2+ million lines of 
> code, without losing maintainability. [...]

in fact Linux 1.0, released in early 1994, was only 170,000 LOC:

  http://www.kernel.org/pub/linux/kernel/v1.0/linux-1.0.tar.gz

and i just looked at a few random files in it - it's pretty clean.

	Ingo

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-06  1:16                                                                   ` [kvm-devel] " Rusty Russell
@ 2007-04-06 18:59                                                                     ` Ingo Molnar
  0 siblings, 0 replies; 81+ messages in thread
From: Ingo Molnar @ 2007-04-06 18:59 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Avi Kivity, kvm-devel, netdev


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> > prototyping new kernel APIs to implement user-space network drivers, 
> > on a crufty codebase is not something that should be done lightly.
> 
> I think you overestimate my radicalism.  I was considering readv() and 
> writev() on the tap device.

ok :-) How would packeting be handled, or would this be alike a raw 
socket in essence, but not in 'peek' but 'filter through' mode? I think 
it's not quite trivial. (but maybe i'm way too radical again :)

	Ingo

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-06  1:02                                                     ` Rusty Russell
@ 2007-04-08  5:36                                                       ` Avi Kivity
       [not found]                                                         ` <46187F4E.1080807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-08  5:36 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, netdev

Rusty Russell wrote:
> On Thu, 2007-04-05 at 10:17 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> You didn't quote Anthony's point about "it's more about there not being
>>> good enough userspace interfaces to do network IO."
>>>
>>> It's easier to write a kernel-space network driver, but it's not
>>> obviously the right thing to do until we can show that an efficient
>>> packet-level userspace interface isn't possible.  I don't think that's
>>> been done, and it would be interesting to try.
>>>   
>>>       
>> In the case of networking, the copyful interfaces on receive are driven 
>> by the hardware not knowing how to split the header from the data.  On 
>> transmit I agree, it could be made copyless from userspace (somthing 
>> like sendfilev, only not file oriented).
>>     
>
> Hi Avi,
>
> 	I don't think you've thought about this very hard.  The receive copy is
> completely independent with whether the packet is going to the guest via
> a kernel driver or via userspace, so not relevant.
>   

A packet received in the kernel cannot be made available to userspace in 
a safe manner without a copy, as it will not be aligned with page 
boundaries, so userspace cannot examine the packet until after one copy 
has occured.  After userspace has determined what to do with the packet, 
another copy must take place to get it there.

There's a counterexample, mmapped sockets, but that works only when all 
packets arriving on a card are exposed to the same process.  This is 
useful for tcpdump or for what you outline below but is hardly generic.

> 	And if all packets from the card are going to the guest, you can
> deliver directly.  Userspace or kernel, no difference.
>   

That is not the common case.  Nor is it true when there is a mismatch 
between the card's capabilties and guest expectations and constraints.  
For example, guest memory is not physically contiguous so a NIC that 
won't do scatter/gather will require bouncing (or an iommu, but that's 
not here yet).

> 	And we have a "sendfilev not file oriented": it's called "writev" 8)
>   

writev() cannot be made copyless for networking.  One needs an async 
interface so the kernel can complete the write after the NIC acks the 
dma transfer, or a kernel driver.

> 	An in-kernel driver can avoid system call overhead and page references.
> But a better tap device helps more than just KVM.
>   

I'll believe it when I see it.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                         ` <46187F4E.1080807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-08  9:04                                                           ` Muli Ben-Yehuda
  2007-04-09  2:50                                                           ` Rusty Russell
  1 sibling, 0 replies; 81+ messages in thread
From: Muli Ben-Yehuda @ 2007-04-08  9:04 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Sun, Apr 08, 2007 at 08:36:14AM +0300, Avi Kivity wrote:

> That is not the common case.  Nor is it true when there is a
> mismatch between the card's capabilties and guest expectations and
> constraints.  For example, guest memory is not physically contiguous
> so a NIC that won't do scatter/gather will require bouncing (or an
> iommu, but that's not here yet).

Actually, Allen Key from Intel just posted the first VT-d patches to
xen-devel a couple of days ago. I wonder if anyone is working on kvm
support (which would require Linux support).

Cheers,
Muli

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                         ` <46187F4E.1080807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-08  9:04                                                           ` Muli Ben-Yehuda
@ 2007-04-09  2:50                                                           ` Rusty Russell
       [not found]                                                             ` <1176087018.11664.65.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-09  2:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Sun, 2007-04-08 at 08:36 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > Hi Avi,
> >
> > 	I don't think you've thought about this very hard.  The receive copy is
> > completely independent with whether the packet is going to the guest via
> > a kernel driver or via userspace, so not relevant.
> >   
> 
> A packet received in the kernel cannot be made available to userspace in 
> a safe manner without a copy, as it will not be aligned with page 
> boundaries, so userspace cannot examine the packet until after one copy 
> has occured.

Hi Avi!

	I'm a little puzzled by your response.  Hmm...

	lguest's userspace network frontend does exactly as many copies as
Ingo's in-host-kernel code.  One from the Guest, one to the Guest.

Does that clarify?
Rusty.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                             ` <1176087018.11664.65.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-04-09  7:10                                                               ` Avi Kivity
       [not found]                                                                 ` <4619E6DC.3010804-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-09  7:10 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Rusty Russell wrote:
> On Sun, 2007-04-08 at 08:36 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> Hi Avi,
>>>
>>> 	I don't think you've thought about this very hard.  The receive copy is
>>> completely independent with whether the packet is going to the guest via
>>> a kernel driver or via userspace, so not relevant.
>>>   
>>>       
>> A packet received in the kernel cannot be made available to userspace in 
>> a safe manner without a copy, as it will not be aligned with page 
>> boundaries, so userspace cannot examine the packet until after one copy 
>> has occured.
>>     
>
> Hi Avi!
>
> 	I'm a little puzzled by your response.  Hmm...
>
> 	lguest's userspace network frontend does exactly as many copies as
> Ingo's in-host-kernel code.  One from the Guest, one to the Guest.
>
>   

kvm pvnet is suboptimal now.  The number of copies could be reduced by 
two (to zero), by constructing an skb that points to guest memory.  
Right now, this can only be done in-kernel.

With current userspace networking interfaces, one cannot build a network 
device that has less than one copy on transmit, because sendmsg() *must* 
copy the data (as there is no completion notification).  sendfilev(), 
even if it existed, cannot be used: it is copyless, but lacks completion 
notification.  It is useful only on unchanging data like read-only files.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                 ` <4619E6DC.3010804-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-09  9:46                                                                   ` Rusty Russell
       [not found]                                                                     ` <1176111984.11664.90.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-09  9:46 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Mon, 2007-04-09 at 10:10 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > 	I'm a little puzzled by your response.  Hmm...
> >
> > 	lguest's userspace network frontend does exactly as many copies as
> > Ingo's in-host-kernel code.  One from the Guest, one to the Guest.
> 
> kvm pvnet is suboptimal now.  The number of copies could be reduced by 
> two (to zero), by constructing an skb that points to guest memory.  
> Right now, this can only be done in-kernel.

Sorry, you lost me here.  You mean both input and output copies can be
eliminated?  Or are you talking about another two copies somewhere?

But I don't get this "we can enhance the kernel but not userspace" vibe
8(

> With current userspace networking interfaces, one cannot build a network 
> device that has less than one copy on transmit, because sendmsg() *must* 
> copy the data (as there is no completion notification).

Why are you talking about sendmsg()?  Perhaps this is where we're
getting tangled up.

We're dealing with the tun/tap device here, not a socket.

>  sendfilev(), 
> even if it existed, cannot be used: it is copyless, but lacks completion 
> notification.  It is useful only on unchanging data like read-only files.

Again, sendfile is a *much* harder problem than sending a single packet
once, which is the question here.

Rusty.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                     ` <1176111984.11664.90.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-04-09 13:38                                                                       ` Avi Kivity
       [not found]                                                                         ` <461A41CA.9080201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-04-11  3:53                                                                         ` [kvm-devel] " Rusty Russell
  0 siblings, 2 replies; 81+ messages in thread
From: Avi Kivity @ 2007-04-09 13:38 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Rusty Russell wrote:
> On Mon, 2007-04-09 at 10:10 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> 	I'm a little puzzled by your response.  Hmm...
>>>
>>> 	lguest's userspace network frontend does exactly as many copies as
>>> Ingo's in-host-kernel code.  One from the Guest, one to the Guest.
>>>       
>> kvm pvnet is suboptimal now.  The number of copies could be reduced by 
>> two (to zero), by constructing an skb that points to guest memory.  
>> Right now, this can only be done in-kernel.
>>     
>
> Sorry, you lost me here.  You mean both input and output copies can be
> eliminated?  Or are you talking about another two copies somewhere?
>   

On the transmit path, current kvm pvnet has two copies:

1.  on the guest side, the driver copies the skb data into the shared ring
2. on the host side, the device copies the data from the ring into a 
newly allocated skb

Both of these copies can be eliminated with a host-side kernel.  With 
current userspace interfaces, only one copy can be eliminated.

Similar logic applies to receive, except that one copy must remain.

> But I don't get this "we can enhance the kernel but not userspace" vibe
> 8(
>   

I've been waiting for network aio since ~2003.  If it arrives in the 
next few days, I'm all for it; much more than kvm can use it 
profitably.  But I'm not going to write that interface myself.

Moreover, some things just don't lend themselves to a userspace 
abstraction.  If we want to expose tso (tcp segmentation offload), we 
can easily do so with a kernel driver since the kernel interfaces are 
all tso aware.  Tacking on tso awareness to tun/tap is doable, but at 
the very least wierd.

>   
>> With current userspace networking interfaces, one cannot build a network 
>> device that has less than one copy on transmit, because sendmsg() *must* 
>> copy the data (as there is no completion notification).
>>     
>
> Why are you talking about sendmsg()?  Perhaps this is where we're
> getting tangled up.
>
> We're dealing with the tun/tap device here, not a socket.
>
>   

Hmm.  tun actually has aio_write implemented, but it seems synchronous.  
So does the read path.

If these are made truly asynchronous, and the write path is made in 
addition copyless, then we might have something workable.  I still 
cringe at having a pagetable walk in order to deliver a 1500-byte packet.


>>  sendfilev(), 
>> even if it existed, cannot be used: it is copyless, but lacks completion 
>> notification.  It is useful only on unchanging data like read-only files.
>>     
>
> Again, sendfile is a *much* harder problem than sending a single packet
> once, which is the question here.
>   

sendfile() is a *different* problem.  It doesn't need completion because 
the data is assumed not to change under it.

Consider that the guest may be issuing a megabyte-sized sendfile() which 
is broken into 17 tso frames.  We need to preserve the large structures 
as much as possible or we end up repeating the simple "single packet 
once" path 700 times.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                         ` <461A41CA.9080201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-10  8:07                                                                           ` Evgeniy Polyakov
  2007-04-10  8:19                                                                             ` [kvm-devel] " Avi Kivity
  0 siblings, 1 reply; 81+ messages in thread
From: Evgeniy Polyakov @ 2007-04-10  8:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Mon, Apr 09, 2007 at 04:38:18PM +0300, Avi Kivity (avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org) wrote:
> >But I don't get this "we can enhance the kernel but not userspace" vibe
> >8(
> >  
> 
> I've been waiting for network aio since ~2003.  If it arrives in the 
> next few days, I'm all for it; much more than kvm can use it 
> profitably.  But I'm not going to write that interface myself.

Hmm, you missed at least two implementations of network aio in the 
previous year, and now with syslets we can have third one.

But it looks from this discussion, that it will not prevent from
changing in-kernel driver - place a hook into skb allocation path and
allocate data from opposing memory - get pages from another side and put
them into fragments, then copy headers into skb->data.

-- 
	Evgeniy Polyakov

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-10  8:07                                                                           ` Evgeniy Polyakov
@ 2007-04-10  8:19                                                                             ` Avi Kivity
       [not found]                                                                               ` <461B48A8.1060904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-10  8:19 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Rusty Russell, Ingo Molnar, kvm-devel, netdev

Evgeniy Polyakov wrote:
> On Mon, Apr 09, 2007 at 04:38:18PM +0300, Avi Kivity (avi@qumranet.com) wrote:
>   
>>> But I don't get this "we can enhance the kernel but not userspace" vibe
>>> 8(
>>>  
>>>       
>> I've been waiting for network aio since ~2003.  If it arrives in the 
>> next few days, I'm all for it; much more than kvm can use it 
>> profitably.  But I'm not going to write that interface myself.
>>     
>
> Hmm, you missed at least two implementations of network aio in the 
> previous year, and now with syslets we can have third one.
>   

I meant, network aio in the mainline kernel.  I am aware of the various
out-of-tree implementations.

> But it looks from this discussion, that it will not prevent from
> changing in-kernel driver - place a hook into skb allocation path and
> allocate data from opposing memory - get pages from another side and put
> them into fragments, then copy headers into skb->data.
>   

I don't understand this (opposing memory, another side?).  Can you
elaborate?

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                               ` <461B48A8.1060904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-10  8:58                                                                                 ` Evgeniy Polyakov
  2007-04-10 11:21                                                                                   ` [kvm-devel] " Avi Kivity
  0 siblings, 1 reply; 81+ messages in thread
From: Evgeniy Polyakov @ 2007-04-10  8:58 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Tue, Apr 10, 2007 at 11:19:52AM +0300, Avi Kivity (avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org) wrote:
> I meant, network aio in the mainline kernel.  I am aware of the various
> out-of-tree implementations.

If potential users do not pay attention to initial implementaion, it is
quite hard to them to get into. But actually it does not matter to this
discussion.

> > But it looks from this discussion, that it will not prevent from
> > changing in-kernel driver - place a hook into skb allocation path and
> > allocate data from opposing memory - get pages from another side and put
> > them into fragments, then copy headers into skb->data.
> >   
> 
> I don't understand this (opposing memory, another side?).  Can you
> elaborate?

You want to implement zero-copy network device between host and guest, if
I understood this thread correctly?
So, for sending part, device allocates pages from receiver's memory (or
from shared memory), receiver gets an 'interrupt' and got pages from own
memory, which are attached to new skb and transferred up to the network
stack.
It can be extended to use shared ring of pages.

-- 
	Evgeniy Polyakov

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-10  8:58                                                                                 ` Evgeniy Polyakov
@ 2007-04-10 11:21                                                                                   ` Avi Kivity
       [not found]                                                                                     ` <461B7334.8090807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-10 11:21 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Rusty Russell, Ingo Molnar, kvm-devel, netdev

Evgeniy Polyakov wrote:
>>> But it looks from this discussion, that it will not prevent from
>>> changing in-kernel driver - place a hook into skb allocation path and
>>> allocate data from opposing memory - get pages from another side and put
>>> them into fragments, then copy headers into skb->data.
>>>   
>>>       
>> I don't understand this (opposing memory, another side?).  Can you
>> elaborate?
>>     
>
> You want to implement zero-copy network device between host and guest, if
> I understood this thread correctly?
> So, for sending part, device allocates pages from receiver's memory (or
> from shared memory), receiver gets an 'interrupt' and got pages from own
> memory, which are attached to new skb and transferred up to the network
> stack.
> It can be extended to use shared ring of pages.
>   

This is what Xen does.  It is actually less performant than copying, IIRC.

The problem with flipping pages around is that physical addresses are 
cached both in the kvm mmu and in the on-chip tlbs, necessitating 
expensive page table walks and tlb invalidation IPIs.

Note that for sending from the guest an external host can be done 
copylessly, and for the receive side using a dma engine (like I/OAT) can 
reduce the cost of the copy.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                                     ` <461B7334.8090807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-10 11:50                                                                                       ` Evgeniy Polyakov
  2007-04-10 12:17                                                                                         ` [kvm-devel] " Avi Kivity
  0 siblings, 1 reply; 81+ messages in thread
From: Evgeniy Polyakov @ 2007-04-10 11:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Tue, Apr 10, 2007 at 02:21:24PM +0300, Avi Kivity (avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org) wrote:
> >You want to implement zero-copy network device between host and guest, if
> >I understood this thread correctly?
> >So, for sending part, device allocates pages from receiver's memory (or
> >from shared memory), receiver gets an 'interrupt' and got pages from own
> >memory, which are attached to new skb and transferred up to the network
> >stack.
> >It can be extended to use shared ring of pages.
> >  
> 
> This is what Xen does.  It is actually less performant than copying, IIRC.
> 
> The problem with flipping pages around is that physical addresses are 
> cached both in the kvm mmu and in the on-chip tlbs, necessitating 
> expensive page table walks and tlb invalidation IPIs.

Hmm, I'm not familiar with Xen driver, but similar technique was used
with zero-copy network sniffer some time ago, substituting userspace
pages with pages containing skb data was about 25-50% faster than
copying 1500 bytes in general, and in order of 10 times faster in some
cases.

Check a link please in case we are talking about different ideas:
http://marc.info/?l=linux-netdev&m=112262743505711&w=2

-- 
	Evgeniy Polyakov

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-10 11:50                                                                                       ` Evgeniy Polyakov
@ 2007-04-10 12:17                                                                                         ` Avi Kivity
       [not found]                                                                                           ` <461B8069.6070007-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-10 12:17 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Rusty Russell, Ingo Molnar, kvm-devel, netdev

Evgeniy Polyakov wrote:
>> This is what Xen does.  It is actually less performant than copying, IIRC.
>>
>> The problem with flipping pages around is that physical addresses are 
>> cached both in the kvm mmu and in the on-chip tlbs, necessitating 
>> expensive page table walks and tlb invalidation IPIs.
>>     
>
> Hmm, I'm not familiar with Xen driver, but similar technique was used
> with zero-copy network sniffer some time ago, substituting userspace
> pages with pages containing skb data was about 25-50% faster than
> copying 1500 bytes in general, and in order of 10 times faster in some
> cases.
>
> Check a link please in case we are talking about different ideas:
> http://marc.info/?l=linux-netdev&m=112262743505711&w=2
>
>   

I don't really understand what you're testing there.  in particular, how 
can the copying time change so dramatically depending on whether you've 
just rebooted or not?



-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                                           ` <461B8069.6070007-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-10 12:30                                                                                             ` Evgeniy Polyakov
       [not found]                                                                                               ` <20070410123034.GA11493-9fLWQ3dKdXwox3rIn2DAYQ@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Evgeniy Polyakov @ 2007-04-10 12:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Tue, Apr 10, 2007 at 03:17:45PM +0300, Avi Kivity (avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org) wrote:
> >Check a link please in case we are talking about different ideas:
> >http://marc.info/?l=linux-netdev&m=112262743505711&w=2
> >
> >  
> 
> I don't really understand what you're testing there.  in particular, how 
> can the copying time change so dramatically depending on whether you've 
> just rebooted or not?
 
I tested page remapping time - i.e. time to replace a page in two
different mappings - the same should be performed in host and guest
kernels if such design is going to be used for communication.

I can only explain after-reboot slow copy with empty caches - arbitrary
kernel pages were copied into buffer (not the same data as in posted
code).

-- 
	Evgeniy Polyakov

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                                               ` <20070410123034.GA11493-9fLWQ3dKdXwox3rIn2DAYQ@public.gmane.org>
@ 2007-04-10 12:49                                                                                                 ` Avi Kivity
  0 siblings, 0 replies; 81+ messages in thread
From: Avi Kivity @ 2007-04-10 12:49 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Evgeniy Polyakov wrote:
> On Tue, Apr 10, 2007 at 03:17:45PM +0300, Avi Kivity (avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org) wrote:
>   
>>> Check a link please in case we are talking about different ideas:
>>> http://marc.info/?l=linux-netdev&m=112262743505711&w=2
>>>
>>>  
>>>       
>> I don't really understand what you're testing there.  in particular, how 
>> can the copying time change so dramatically depending on whether you've 
>> just rebooted or not?
>>     
>  
> I tested page remapping time - i.e. time to replace a page in two
> different mappings - the same should be performed in host and guest
> kernels if such design is going to be used for communication.
>
> I can only explain after-reboot slow copy with empty caches - arbitrary
> kernel pages were copied into buffer (not the same data as in posted
> code).
>   

Doing this in kvm would be significantly more complex, as we'd need to 
use full reverse mapping to locate all guest mappings (we already 
reverse map writable pages for other reasons), so the 25-50% difference 
might be nullified or even turn into overhead.

Here are the Xen numbers for reference.  Xen probably has more overhead 
than kvm for such things, though, as it needs to do hypercalls from dom0 
which is in-kernel for kvm.

http://lists.xensource.com/archives/html/xen-devel/2007-03/msg01218.html

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-09 13:38                                                                       ` Avi Kivity
       [not found]                                                                         ` <461A41CA.9080201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-11  3:53                                                                         ` Rusty Russell
       [not found]                                                                           ` <1176263593.26372.84.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  1 sibling, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-11  3:53 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm-devel, netdev

On Mon, 2007-04-09 at 16:38 +0300, Avi Kivity wrote:
> Moreover, some things just don't lend themselves to a userspace 
> abstraction.  If we want to expose tso (tcp segmentation offload), we 
> can easily do so with a kernel driver since the kernel interfaces are 
> all tso aware.  Tacking on tso awareness to tun/tap is doable, but at 
> the very least wierd.

It is kinda weird, yes, but it certainly makes sense.  All the arguments
for tso apply in triplicate to userspace packet sends...

> > We're dealing with the tun/tap device here, not a socket.
> 
> Hmm.  tun actually has aio_write implemented, but it seems synchronous.  
> So does the read path.
> 
> If these are made truly asynchronous, and the write path is made in 
> addition copyless, then we might have something workable.  I still 
> cringe at having a pagetable walk in order to deliver a 1500-byte packet.

Right, now we're talking!

However, it's not clear to me why creating an skb which references a kvm
guest's memory doesn't need a pagetable walk, but a packet in (other)
userspace memory does?

My conviction which started this discussion is that if we can offer an
efficient interface for kvm, we should be able to offer an efficient
interface for any (other) userspace.

As to async, I'm not *so* worried about that for the moment, although it
would probably be nicer to fail than to block.  Otherwise we could
simply set an skb destructor to wake us up.

> > Again, sendfile is a *much* harder problem than sending a single packet
> > once, which is the question here.
> 
> sendfile() is a *different* problem.  It doesn't need completion because 
> the data is assumed not to change under it.

Well, let's not argue over that, it's irrelevant.  Hopefully we can do
that over a beer or equivalent sometime.

I think the first step is to see how much worse a decent userspace net
driver is compared with the current in-kernel one.

Rusty.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                           ` <1176263593.26372.84.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-04-11  4:26                                                                             ` Avi Kivity
       [not found]                                                                               ` <461C6360.1060908-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-11  4:26 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Rusty Russell wrote:
> On Mon, 2007-04-09 at 16:38 +0300, Avi Kivity wrote:
>   
>> Moreover, some things just don't lend themselves to a userspace 
>> abstraction.  If we want to expose tso (tcp segmentation offload), we 
>> can easily do so with a kernel driver since the kernel interfaces are 
>> all tso aware.  Tacking on tso awareness to tun/tap is doable, but at 
>> the very least wierd.
>>     
>
> It is kinda weird, yes, but it certainly makes sense.  All the arguments
> for tso apply in triplicate to userspace packet sends...
>
>   

Well, write() with a large buffer is a sort of tso device.  The problem
is tso breaks through several layers (like I'm advocating in the other
thread :), pushing tcp functionality into ethernet.  Well, we've seen worse.


>>> We're dealing with the tun/tap device here, not a socket.
>>>       
>> Hmm.  tun actually has aio_write implemented, but it seems synchronous.  
>> So does the read path.
>>
>> If these are made truly asynchronous, and the write path is made in 
>> addition copyless, then we might have something workable.  I still 
>> cringe at having a pagetable walk in order to deliver a 1500-byte packet.
>>     
>
> Right, now we're talking!
>
> However, it's not clear to me why creating an skb which references a kvm
> guest's memory doesn't need a pagetable walk, but a packet in (other)
> userspace memory does?
>   

Currently guest pages are stashed in a kernel array, as well as being
mmap()ed into user space.

That's not a very strong argument though, as I'd like to be map
userspace memory into the guest, or map address_spaces to the guest, or
something, so accessing guest physical memory will become more expensive
in time.

> My conviction which started this discussion is that if we can offer an
> efficient interface for kvm, we should be able to offer an efficient
> interface for any (other) userspace.
>   

Fully agreed.  It's mostly a question of who and when.  Designing and
implementing this interface is going to be difficult, require deep
knowledge of Linux networking, and consume a lot of time.

> As to async, I'm not *so* worried about that for the moment, although it
> would probably be nicer to fail than to block.  Otherwise we could
> simply set an skb destructor to wake us up.
>   

Nope.  Being async is critical for copyless networking:

- in the transmit path, so need to stop the sender (guest) from touching
the memory until it's on the wire.  This means 100% of packets sent will
be blocked.
- in the receive path, you could separate receive notification from the
single copy that must be done (like poll() + read()), but to make use of
dma engines you need to provide the end address beforehand.

> I think the first step is to see how much worse a decent userspace net
> driver is compared with the current in-kernel one.
>   

A userspace net interface needs to provide the following:

- true async operations
- multiple packets per operation (for interrupt mitigation) (like
lio_listio)
- scatter/gather packets (iovecs)
- configurable wakeup (by packet count/timeout) for queue management
- hacks (tso)

Most of these can be provided by a combination of the pending aio work,
the pending aio/fd integration, and the not-so-pending tap aio work.  As
the first two are available as patches and the third is limited to the
tap device, it is not unreasonable to try it out.  Maybe it will turn
out not to be as difficult as I predicted just a few lines above.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                               ` <461C6360.1060908-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-11 13:23                                                                                 ` Rusty Russell
       [not found]                                                                                   ` <1176297794.14322.72.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-11 13:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Wed, 2007-04-11 at 07:26 +0300, Avi Kivity wrote:
> Nope.  Being async is critical for copyless networking:
> 
> - in the transmit path, so need to stop the sender (guest) from touching
> the memory until it's on the wire.  This means 100% of packets sent will
> be blocked.

Hi Avi,

	You keep saying stuff like this, and I keep ignoring it.  OK, I'll
bite:

	Why would we try to prevent the sender from altering the packets?

> A userspace net interface needs to provide the following:
> 
> - true async operations

I'll hold on this pending discussion above.

> - multiple packets per operation (for interrupt mitigation) (like
> lio_listio)

The benefits for interrupt mitigation are less clear to me in a virtual
environment (scheduling tends to make it happen anyway); I'd want to
benchmark it.

Some kind of batching to reduce syscall overhead, perhaps, but TSO would
go a fair way towards that anyway (probably not enough).

> - scatter/gather packets (iovecs)

Yes, and this is already present in the tap device.  Anthony suggested a
slightly nasty hack for multiple sg packets in one writev()/readv, which
could also give us batching.

> - configurable wakeup (by packet count/timeout) for queue management

I'm not convinced that this is a showstopper, though.

> - hacks (tso)

I'd usually go for a batch interface over TSO, but if the card we're
sending to actually does TSO then TSO will probably win.

> Most of these can be provided by a combination of the pending aio work,
> the pending aio/fd integration, and the not-so-pending tap aio work.  As
> the first two are available as patches and the third is limited to the
> tap device, it is not unreasonable to try it out.  Maybe it will turn
> out not to be as difficult as I predicted just a few lines above.

Indeed, I don't think we're asking for a revolution a-la VJ-style
channels.  But I'm still itching to get back to that, and this might yet
provide an excuse 8)

Cheers,
Rusty.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                                   ` <1176297794.14322.72.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-04-11 14:28                                                                                     ` Avi Kivity
       [not found]                                                                                       ` <461CF098.3090003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-11 14:28 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Rusty Russell wrote:
> On Wed, 2007-04-11 at 07:26 +0300, Avi Kivity wrote:
>   
>> Nope.  Being async is critical for copyless networking:
>>
>> - in the transmit path, so need to stop the sender (guest) from touching
>> the memory until it's on the wire.  This means 100% of packets sent will
>> be blocked.
>>     
>
> Hi Avi,
>
> 	You keep saying stuff like this, and I keep ignoring it.  OK, I'll
> bite:
>
> 	Why would we try to prevent the sender from altering the packets?
>
>   

To avoid data corruption.

The guest wants to send a packet.  It calls write(), which causes an skb 
to be allocated, data to be copied into it, the entire networking stack 
gets into gear, and the guest-side driver instructs the "device" to send 
the packet.

With async operations, the saga continues like this: the host-side 
driver allocates an skb, get_page()s and attaches the data to the new 
skb, this skb crosses the bridge, trickles into the real ethernet 
device, gets queued there, sent, interrupts fire, triggering async 
completion.  On this completion, we send a virtual interrupt to the 
guest, which tells it to destroy the skb and reclaim the pages attached 
to it.

Without async operations, we don't have a hook to notify the guest when 
to reclaim the skb.  If we do it too soon, the skb can be reclaimed and 
the memory reused before the real device gets to see it, so we end up 
sending data that we did not intend.  The only way to avoid it is to 
copy the data somewhere safe, but that is exactly what we don't want to do.

>> - multiple packets per operation (for interrupt mitigation) (like
>> lio_listio)
>>     
>
> The benefits for interrupt mitigation are less clear to me in a virtual
> environment (scheduling tends to make it happen anyway); I'd want to
> benchmark it.
>
>   

Yes, the guest will probably submit multiple packets in one hypercall.  
It would be nice for the userspace driver to be able to submit them to 
the host kernel in one syscall.

> Some kind of batching to reduce syscall overhead, perhaps, but TSO would
> go a fair way towards that anyway (probably not enough).
>
>   

For some workloads, sure.


>> - scatter/gather packets (iovecs)
>>     
>
> Yes, and this is already present in the tap device.  Anthony suggested a
> slightly nasty hack for multiple sg packets in one writev()/readv, which
> could also give us batching.
>
>   

No need for hacks if we get list aio support one day.

>> - configurable wakeup (by packet count/timeout) for queue management
>>     
>
> I'm not convinced that this is a showstopper, though.
>   

It probably isn't.  It's free with aio though.

>   
>> - hacks (tso)
>>     
>
> I'd usually go for a batch interface over TSO, but if the card we're
> sending to actually does TSO then TSO will probably win.
>   

Sure, if tso helps a regular host then it should help one that happens 
to be running a virtual machine.

>   
>> Most of these can be provided by a combination of the pending aio work,
>> the pending aio/fd integration, and the not-so-pending tap aio work.  As
>> the first two are available as patches and the third is limited to the
>> tap device, it is not unreasonable to try it out.  Maybe it will turn
>> out not to be as difficult as I predicted just a few lines above.
>>     
>
> Indeed, I don't think we're asking for a revolution a-la VJ-style
> channels.  But I'm still itching to get back to that, and this might yet
> provide an excuse 8)
>   

I'll be happy if this can be made to work.  It will make the paravirt 
guest-side driver work in kvm-less setups, which are useful for testing, 
and of course reduction in kernel code is beneficial.  It will be slower 
that in-kernel, but if we get the batching right, perhaps not 
significantly slower.  I'm mostly concerned that this depends on code 
that has eluded merging for such a long time.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                                       ` <461CF098.3090003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-04-11 23:30                                                                                         ` Rusty Russell
       [not found]                                                                                           ` <1176334200.14322.133.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-11 23:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

On Wed, 2007-04-11 at 17:28 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > On Wed, 2007-04-11 at 07:26 +0300, Avi Kivity wrote:
> >   
> >> Nope.  Being async is critical for copyless networking:
> >>
> With async operations, the saga continues like this: the host-side 
> driver allocates an skb, get_page()s and attaches the data to the new 
> skb, this skb crosses the bridge, trickles into the real ethernet 
> device, gets queued there, sent, interrupts fire, triggering async 
> completion.  On this completion, we send a virtual interrupt to the 
> guest, which tells it to destroy the skb and reclaim the pages attached 
> to it.

Hi Avi!

	Thanks for spelling it out, I now understand your POV.  I had
considered it obvious that a (non-async) write which didn't copy would
block until the skb was finished with, which is easy to code up within
the tap device itself.  Otherwise it's actually an async write without a
notification mechanism, which I agree is broken.

	Note though: if the guest can change the packet headers they can
subvert some firewall rules and possibly crash the host.  None of the
networking code I wrote expects packets to change in flight 8(

	This applies to a userspace or kernelspace driver.

> > Yes, and this is already present in the tap device.  Anthony suggested a
> > slightly nasty hack for multiple sg packets in one writev()/readv, which
> > could also give us batching.
> 
> No need for hacks if we get list aio support one day.

As you point out though, aio is not something we want to hold our breath
for.  Plus, aio never makes things simpler, and complexity kills
puppies.

Cheers!
Rusty.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: QEMU PIC indirection patch for in-kernel APIC work
       [not found]                                                                                           ` <1176334200.14322.133.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-04-12  3:32                                                                                             ` Avi Kivity
  2007-04-16  0:22                                                                                               ` [kvm-devel] " Rusty Russell
  0 siblings, 1 reply; 81+ messages in thread
From: Avi Kivity @ 2007-04-12  3:32 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, netdev

Rusty Russell wrote:
> On Wed, 2007-04-11 at 17:28 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> On Wed, 2007-04-11 at 07:26 +0300, Avi Kivity wrote:
>>>   
>>>       
>>>> Nope.  Being async is critical for copyless networking:
>>>>
>>>>         
>> With async operations, the saga continues like this: the host-side 
>> driver allocates an skb, get_page()s and attaches the data to the new 
>> skb, this skb crosses the bridge, trickles into the real ethernet 
>> device, gets queued there, sent, interrupts fire, triggering async 
>> completion.  On this completion, we send a virtual interrupt to the 
>> guest, which tells it to destroy the skb and reclaim the pages attached 
>> to it.
>>     
>
> Hi Avi!
>
> 	Thanks for spelling it out, I now understand your POV.  I had
> considered it obvious that a (non-async) write which didn't copy would
> block until the skb was finished with, which is easy to code up within
> the tap device itself.  Otherwise it's actually an async write without a
> notification mechanism, which I agree is broken.
>
>   

I hadn't considered an always-blocking (or unbuffered) networking API. 
It's very counter to current APIs, but does make sense with things like
syslets.  Without syslets, I don't think it's very useful as you need
some artificial threads to keep things humming along.

(How would userspace specify it? O_DIRECT when opening the tap?)

I don't think there's a lot of difference between implementing aio or
always-blocking copyless writes for tap.  They just differ in how they
sleep and in how to access user pages.

> 	Note though: if the guest can change the packet headers they can
> subvert some firewall rules and possibly crash the host.  None of the
> networking code I wrote expects packets to change in flight 8(
>
> 	This applies to a userspace or kernelspace driver.
>
>   

Umm, right.  We could write-protect the packets (which would be very
expensive).  We could set the evil bit on guest-originated packets, and
rewrite the entire networking stack to copy any part which is inspected
if the evil bit is set.  We need more head-scratching on this.

>>> Yes, and this is already present in the tap device.  Anthony suggested a
>>> slightly nasty hack for multiple sg packets in one writev()/readv, which
>>> could also give us batching.
>>>       
>> No need for hacks if we get list aio support one day.
>>     
>
> As you point out though, aio is not something we want to hold our breath
> for.  Plus, aio never makes things simpler, and complexity kills
> puppies.
>   

The puppies had better stay away from qemu then, as it is completely async.

Always-blocking writes won't reduce complexity.  Suddenly you need a
thread for each request batch and some pleasant code for joining the
threads when done.  Syslets do make it go away, though they're more for
the mostly-nonblocking-with-occasional-blockage stuff rather than the
always blocking thingie you describe.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-12  3:32                                                                                             ` Avi Kivity
@ 2007-04-16  0:22                                                                                               ` Rusty Russell
  2007-04-16  5:13                                                                                                 ` Avi Kivity
  0 siblings, 1 reply; 81+ messages in thread
From: Rusty Russell @ 2007-04-16  0:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm-devel, netdev

On Thu, 2007-04-12 at 06:32 +0300, Avi Kivity wrote:
> I hadn't considered an always-blocking (or unbuffered) networking API. 
> It's very counter to current APIs, but does make sense with things like
> syslets.  Without syslets, I don't think it's very useful as you need
> some artificial threads to keep things humming along.
> 
> (How would userspace specify it? O_DIRECT when opening the tap?)

TBH, I hadn't thought that far.  Tap already has those IFF_NO_PI etc
flags, but it might make sense to just be the default.  From userspace's
POV it's not a semantic change.

OK, just tested: I can get 230,000 packets (28 byte UDP) through the tun
device in a second (130,000 actually out the 100-base-T NIC, 100,000
dropped).  If the tun driver's write() blocks until the skb is
destroyed, it's 4,000 packets.

So your intuition was right: skb_free latency on xmit (at least for this
e1000) is far too large for anything but an async solution.

Will ponder further.

Thanks!
Rusty.



^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [kvm-devel] QEMU PIC indirection patch for in-kernel APIC work
  2007-04-16  0:22                                                                                               ` [kvm-devel] " Rusty Russell
@ 2007-04-16  5:13                                                                                                 ` Avi Kivity
  0 siblings, 0 replies; 81+ messages in thread
From: Avi Kivity @ 2007-04-16  5:13 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, netdev

Rusty Russell wrote:
> On Thu, 2007-04-12 at 06:32 +0300, Avi Kivity wrote:
>   
>> I hadn't considered an always-blocking (or unbuffered) networking API. 
>> It's very counter to current APIs, but does make sense with things like
>> syslets.  Without syslets, I don't think it's very useful as you need
>> some artificial threads to keep things humming along.
>>
>> (How would userspace specify it? O_DIRECT when opening the tap?)
>>     
>
> TBH, I hadn't thought that far.  Tap already has those IFF_NO_PI etc
> flags, but it might make sense to just be the default.  From userspace's
> POV it's not a semantic change.
>
> OK, just tested: I can get 230,000 packets (28 byte UDP) through the tun
> device in a second (130,000 actually out the 100-base-T NIC, 100,000
> dropped).  If the tun driver's write() blocks until the skb is
> destroyed, it's 4,000 packets.
>
> So your intuition was right: skb_free latency on xmit (at least for this
> e1000) is far too large for anything but an async solution.
>
> Will ponder further.
>   

I think aio_write (but done copyless-lessly) is the way to go.  Not only
is the infrastructure there, but the API already allows for multiple
packet submission and for batching completions.  Fitting into that
framework ought to be easier than starting yet another one.

It still misses scatter/gather and integration with fd-based
notification, but there are patches around for that.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 81+ messages in thread

end of thread, other threads:[~2007-04-16  5:13 UTC | newest]

Thread overview: 81+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-02 11:46 QEMU PIC indirection patch for in-kernel APIC work Gregory Haskins
     [not found] ` <4610A6A9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-04  6:53   ` Avi Kivity
     [not found]     ` <46134B74.1080004-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 14:20       ` Anthony Liguori
     [not found]         ` <4613B438.60107-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 14:39           ` Avi Kivity
     [not found]             ` <4613B89F.8090806-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 14:55               ` Anthony Liguori
     [not found]                 ` <4613BC6B.1070708-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 15:06                   ` Avi Kivity
     [not found]                     ` <4613BF07.50606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 15:36                       ` Nakajima, Jun
     [not found]                         ` <8FFF7E42E93CC646B632AB40643802A8025B9580-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-04 15:47                           ` Dor Laor
     [not found]                             ` <64F9B87B6B770947A9F8391472E032160B318E96-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
2007-04-04 15:53                               ` Anthony Liguori
     [not found]                                 ` <4613C9EE.5030600-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 16:05                                   ` Avi Kivity
     [not found]                                     ` <4613CCD1.2070702-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 16:20                                       ` Nakajima, Jun
     [not found]                                         ` <8FFF7E42E93CC646B632AB40643802A8025B962E-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-04 16:27                                           ` Dor Laor
     [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B318EDB-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
2007-04-04 16:43                                               ` Anthony Liguori
     [not found]                                                 ` <4613D596.7080201-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 16:54                                                   ` Avi Kivity
2007-04-04 16:32                                           ` Avi Kivity
     [not found]                                             ` <4613D30E.7030905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 16:48                                               ` Nakajima, Jun
     [not found]                                                 ` <8FFF7E42E93CC646B632AB40643802A8025B96AA-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-04 16:58                                                   ` Avi Kivity
     [not found]                                                     ` <4613D93A.5020902-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 17:09                                                       ` Nakajima, Jun
     [not found]                                                         ` <8FFF7E42E93CC646B632AB40643802A8025B970D-1a9uaKK1+wJcIJlls4ac1rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-04 21:33                                                           ` Dor Laor
2007-04-04 18:12                                                   ` Anthony Liguori
2007-04-04 15:51                       ` Anthony Liguori
     [not found]                         ` <4613C993.9020405-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 16:02                           ` Avi Kivity
     [not found]                             ` <4613CC01.1090500-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 16:09                               ` Anthony Liguori
     [not found]                                 ` <4613CDB2.4000903-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 16:19                                   ` Avi Kivity
     [not found]                                     ` <4613D001.3040606-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-04 20:01                                       ` Ingo Molnar
     [not found]                                         ` <20070404200112.GA6070-X9Un+BFzKDI@public.gmane.org>
2007-04-04 20:24                                           ` Anthony Liguori
     [not found]                                             ` <4614098F.2030307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-04-04 21:21                                               ` Ingo Molnar
2007-04-04 23:19                                                 ` [kvm-devel] " Rusty Russell
2007-04-05  7:17                                                   ` Avi Kivity
2007-04-06  1:02                                                     ` Rusty Russell
2007-04-08  5:36                                                       ` Avi Kivity
     [not found]                                                         ` <46187F4E.1080807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-08  9:04                                                           ` Muli Ben-Yehuda
2007-04-09  2:50                                                           ` Rusty Russell
     [not found]                                                             ` <1176087018.11664.65.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-04-09  7:10                                                               ` Avi Kivity
     [not found]                                                                 ` <4619E6DC.3010804-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-09  9:46                                                                   ` Rusty Russell
     [not found]                                                                     ` <1176111984.11664.90.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-04-09 13:38                                                                       ` Avi Kivity
     [not found]                                                                         ` <461A41CA.9080201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-10  8:07                                                                           ` Evgeniy Polyakov
2007-04-10  8:19                                                                             ` [kvm-devel] " Avi Kivity
     [not found]                                                                               ` <461B48A8.1060904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-10  8:58                                                                                 ` Evgeniy Polyakov
2007-04-10 11:21                                                                                   ` [kvm-devel] " Avi Kivity
     [not found]                                                                                     ` <461B7334.8090807-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-10 11:50                                                                                       ` Evgeniy Polyakov
2007-04-10 12:17                                                                                         ` [kvm-devel] " Avi Kivity
     [not found]                                                                                           ` <461B8069.6070007-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-10 12:30                                                                                             ` Evgeniy Polyakov
     [not found]                                                                                               ` <20070410123034.GA11493-9fLWQ3dKdXwox3rIn2DAYQ@public.gmane.org>
2007-04-10 12:49                                                                                                 ` Avi Kivity
2007-04-11  3:53                                                                         ` [kvm-devel] " Rusty Russell
     [not found]                                                                           ` <1176263593.26372.84.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-04-11  4:26                                                                             ` Avi Kivity
     [not found]                                                                               ` <461C6360.1060908-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-11 13:23                                                                                 ` Rusty Russell
     [not found]                                                                                   ` <1176297794.14322.72.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-04-11 14:28                                                                                     ` Avi Kivity
     [not found]                                                                                       ` <461CF098.3090003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-11 23:30                                                                                         ` Rusty Russell
     [not found]                                                                                           ` <1176334200.14322.133.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-04-12  3:32                                                                                             ` Avi Kivity
2007-04-16  0:22                                                                                               ` [kvm-devel] " Rusty Russell
2007-04-16  5:13                                                                                                 ` Avi Kivity
     [not found]                                                   ` <1175728768.12230.593.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-04-05  9:30                                                     ` Ingo Molnar
     [not found]                                                       ` <20070405093033.GC25448-X9Un+BFzKDI@public.gmane.org>
2007-04-05  9:58                                                         ` Avi Kivity
2007-04-05 10:26                                                           ` [kvm-devel] " Ingo Molnar
2007-04-05 11:26                                                             ` Avi Kivity
     [not found]                                                               ` <4614DCE1.70905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-05 11:36                                                                 ` Ingo Molnar
2007-04-06  1:16                                                                   ` [kvm-devel] " Rusty Russell
2007-04-06 18:59                                                                     ` Ingo Molnar
2007-04-05 10:55                                                         ` Ingo Molnar
2007-04-05 14:32                                                       ` [kvm-devel] " Anthony Liguori
2007-04-06 10:37                                                         ` Ingo Molnar
2007-04-06 11:07                                                           ` Ingo Molnar
2007-04-04 22:07                                               ` Dor Laor
2007-04-05  6:54                                               ` Avi Kivity
2007-04-05  6:40                                           ` Avi Kivity
2007-04-04 16:23                                   ` Dor Laor
     [not found]                                     ` <64F9B87B6B770947A9F8391472E032160B318ED4-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
2007-04-04 16:48                                       ` Anthony Liguori
     [not found]                                         ` <4613D6CD.6060209-rdkfGonbjUSkNkDKm+mE6A@public.gmane.org>
2007-04-04 20:07                                           ` Ingo Molnar
2007-04-05 13:50                                       ` Dong, Eddie
     [not found]                                         ` <0E6FE5D295DE5B4B8D9070C26A227987E0D1EC-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-05 13:59                                           ` Avi Kivity
     [not found]                                             ` <461500B5.4080102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-05 14:52                                               ` Dong, Eddie
     [not found]                                                 ` <0E6FE5D295DE5B4B8D9070C26A227987F2444B-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-05 15:01                                                   ` Avi Kivity
     [not found]                                                     ` <46150F4F.4030505-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-05 15:16                                                       ` Dong, Eddie
     [not found]                                                         ` <0E6FE5D295DE5B4B8D9070C26A227987F24452-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-04-05 15:26                                                           ` Anthony Liguori
2007-04-05 15:22                                                       ` Anthony Liguori
2007-04-05 14:41                                           ` Dor Laor
     [not found]                                             ` <64F9B87B6B770947A9F8391472E032160B319509-yEcIvxbTEBqsx+V+t5oei8rau4O3wl8o3fe8/T/H7NteoWH0uzbU5w@public.gmane.org>
2007-04-05 14:48                                               ` Dong, Eddie
2007-04-05 14:57                                               ` Dong, Eddie
2007-04-04 17:51           ` Gregory Haskins
     [not found]             ` <46139F39.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-04 18:19               ` Anthony Liguori

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.