All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0 of 5] dma api v3
@ 2008-12-12 18:16 ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Here is the latest patchset of the dma api against qemu svn. It should build and
you only have to set DEBUG_BOUNCE somehow when preadv/pwritev aren't
available at compile time. Then you know iovcnt will always be 1 and it'll
handle up to size_t amount of dma in flight with 1M buffer (default setting
but can be reduced to 512bytes as long as max alignment required by any
hardware driver is 512bytes).

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] [PATCH 0 of 5] dma api v3
@ 2008-12-12 18:16 ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Here is the latest patchset of the dma api against qemu svn. It should build and
you only have to set DEBUG_BOUNCE somehow when preadv/pwritev aren't
available at compile time. Then you know iovcnt will always be 1 and it'll
handle up to size_t amount of dma in flight with 1M buffer (default setting
but can be reduced to 512bytes as long as max alignment required by any
hardware driver is 512bytes).

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [PATCH 1 of 5] fix cpu_physical_memory len
  2008-12-12 18:16 ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 18:16   ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

From: Andrea Arcangeli <aarcange@redhat.com>

Be consistent and have length be size_t for all methods.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/cpu-all.h b/cpu-all.h
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -911,14 +911,14 @@ CPUReadMemoryFunc **cpu_get_io_memory_re
 CPUReadMemoryFunc **cpu_get_io_memory_read(int io_index);
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write);
+                            size_t len, int is_write);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
-                                            uint8_t *buf, int len)
+                                            uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, buf, len, 0);
 }
 static inline void cpu_physical_memory_write(target_phys_addr_t addr,
-                                             const uint8_t *buf, int len)
+                                             const uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }
@@ -936,7 +936,7 @@ void cpu_physical_memory_write_rom(targe
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len);
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
-                        uint8_t *buf, int len, int is_write);
+                        uint8_t *buf, size_t len, int is_write);
 
 #define VGA_DIRTY_FLAG       0x01
 #define CODE_DIRTY_FLAG      0x02
diff --git a/exec.c b/exec.c
--- a/exec.c
+++ b/exec.c
@@ -2839,7 +2839,7 @@ CPUReadMemoryFunc **cpu_get_io_memory_re
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, flags;
     target_ulong page;
@@ -2880,7 +2880,7 @@ void cpu_physical_memory_rw(target_phys_
 
 #else
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, io_index;
     uint8_t *ptr;
@@ -3232,7 +3232,7 @@ void stq_phys(target_phys_addr_t addr, u
 
 /* virtual memory access for debug */
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
-                        uint8_t *buf, int len, int is_write)
+                        uint8_t *buf, size_t len, int is_write)
 {
     int l;
     target_phys_addr_t phys_addr;

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] [PATCH 1 of 5] fix cpu_physical_memory len
@ 2008-12-12 18:16   ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

From: Andrea Arcangeli <aarcange@redhat.com>

Be consistent and have length be size_t for all methods.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/cpu-all.h b/cpu-all.h
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -911,14 +911,14 @@ CPUReadMemoryFunc **cpu_get_io_memory_re
 CPUReadMemoryFunc **cpu_get_io_memory_read(int io_index);
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write);
+                            size_t len, int is_write);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
-                                            uint8_t *buf, int len)
+                                            uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, buf, len, 0);
 }
 static inline void cpu_physical_memory_write(target_phys_addr_t addr,
-                                             const uint8_t *buf, int len)
+                                             const uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }
@@ -936,7 +936,7 @@ void cpu_physical_memory_write_rom(targe
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len);
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
-                        uint8_t *buf, int len, int is_write);
+                        uint8_t *buf, size_t len, int is_write);
 
 #define VGA_DIRTY_FLAG       0x01
 #define CODE_DIRTY_FLAG      0x02
diff --git a/exec.c b/exec.c
--- a/exec.c
+++ b/exec.c
@@ -2839,7 +2839,7 @@ CPUReadMemoryFunc **cpu_get_io_memory_re
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, flags;
     target_ulong page;
@@ -2880,7 +2880,7 @@ void cpu_physical_memory_rw(target_phys_
 
 #else
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, io_index;
     uint8_t *ptr;
@@ -3232,7 +3232,7 @@ void stq_phys(target_phys_addr_t addr, u
 
 /* virtual memory access for debug */
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
-                        uint8_t *buf, int len, int is_write)
+                        uint8_t *buf, size_t len, int is_write)
 {
     int l;
     target_phys_addr_t phys_addr;

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 18:16 ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 18:16   ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

From: Andrea Arcangeli <aarcange@redhat.com>

Add can_dma and post_dma methods needed before/after direct IO to guest
physical memory.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/cpu-all.h b/cpu-all.h
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -912,6 +912,11 @@ CPUReadMemoryFunc **cpu_get_io_memory_re
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             size_t len, int is_write);
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                        size_t len);
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                     size_t len, int is_write,
+                                     int alignment);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
                                             uint8_t *buf, size_t len)
 {
diff --git a/exec.c b/exec.c
--- a/exec.c
+++ b/exec.c
@@ -2974,6 +2974,111 @@ void cpu_physical_memory_rw(target_phys_
     }
 }
 
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                     size_t len, int is_write,
+                                     int alignment)
+{
+    int l, first = 1;
+    uint8_t *ptr = NULL;
+    target_phys_addr_t page;
+    unsigned long pd, pd_first = 0;
+    PhysPageDesc *p;
+
+    if (len & (alignment-1))
+        goto out;
+
+    while (len > 0) {
+        page = addr & TARGET_PAGE_MASK;
+        p = phys_page_find(page >> TARGET_PAGE_BITS);
+
+        l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+        if (is_write) {
+            if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM)
+                return NULL;
+        } else {
+            if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
+                !(pd & IO_MEM_ROMD))
+                return NULL;
+        }
+
+        if (first) {
+            first = 0;
+            ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
+                (addr & ~TARGET_PAGE_MASK);
+            if ((unsigned long)ptr & (alignment-1))
+                return NULL;
+            pd_first = pd;
+        }
+
+        /* nonlinear range */
+        if (pd_first != pd)
+            return NULL;
+        pd_first += TARGET_PAGE_SIZE;
+
+        len -= l;
+        addr += l;
+    }
+
+out:
+    return ptr;
+}
+
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                        size_t len)
+{
+    int l;
+    uint8_t *ptr;
+    target_phys_addr_t page;
+    unsigned long pd;
+    PhysPageDesc *p;
+
+    while (len > 0) {
+        page = addr & TARGET_PAGE_MASK;
+
+        l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        p = phys_page_find(page >> TARGET_PAGE_BITS);
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+        if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+            printf("ERROR cpu_physical_memory_post_dma: memory layout changed\n");
+            continue;
+        } else {
+            unsigned long addr1;
+            addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+            /* RAM case */
+            ptr = phys_ram_base + addr1;
+            if (!cpu_physical_memory_is_dirty(addr1)) {
+                /* invalidate code */
+                tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
+                /* set dirty bit */
+                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
+                    (0xff & ~CODE_DIRTY_FLAG);
+            }
+            /* qemu doesn't execute guest code directly, but kvm does
+               therefore fluch instruction caches */
+            if (kvm_enabled())
+                flush_icache_range((unsigned long)ptr,
+                                   ((unsigned long)ptr)+l);
+        }
+        len -= l;
+        addr += l;
+    }
+}
+
 /* used for ROM loading : can write in RAM and ROM */
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len)

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 18:16   ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

From: Andrea Arcangeli <aarcange@redhat.com>

Add can_dma and post_dma methods needed before/after direct IO to guest
physical memory.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/cpu-all.h b/cpu-all.h
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -912,6 +912,11 @@ CPUReadMemoryFunc **cpu_get_io_memory_re
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             size_t len, int is_write);
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                        size_t len);
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                     size_t len, int is_write,
+                                     int alignment);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
                                             uint8_t *buf, size_t len)
 {
diff --git a/exec.c b/exec.c
--- a/exec.c
+++ b/exec.c
@@ -2974,6 +2974,111 @@ void cpu_physical_memory_rw(target_phys_
     }
 }
 
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                     size_t len, int is_write,
+                                     int alignment)
+{
+    int l, first = 1;
+    uint8_t *ptr = NULL;
+    target_phys_addr_t page;
+    unsigned long pd, pd_first = 0;
+    PhysPageDesc *p;
+
+    if (len & (alignment-1))
+        goto out;
+
+    while (len > 0) {
+        page = addr & TARGET_PAGE_MASK;
+        p = phys_page_find(page >> TARGET_PAGE_BITS);
+
+        l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+        if (is_write) {
+            if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM)
+                return NULL;
+        } else {
+            if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
+                !(pd & IO_MEM_ROMD))
+                return NULL;
+        }
+
+        if (first) {
+            first = 0;
+            ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
+                (addr & ~TARGET_PAGE_MASK);
+            if ((unsigned long)ptr & (alignment-1))
+                return NULL;
+            pd_first = pd;
+        }
+
+        /* nonlinear range */
+        if (pd_first != pd)
+            return NULL;
+        pd_first += TARGET_PAGE_SIZE;
+
+        len -= l;
+        addr += l;
+    }
+
+out:
+    return ptr;
+}
+
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                        size_t len)
+{
+    int l;
+    uint8_t *ptr;
+    target_phys_addr_t page;
+    unsigned long pd;
+    PhysPageDesc *p;
+
+    while (len > 0) {
+        page = addr & TARGET_PAGE_MASK;
+
+        l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        p = phys_page_find(page >> TARGET_PAGE_BITS);
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+        if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+            printf("ERROR cpu_physical_memory_post_dma: memory layout changed\n");
+            continue;
+        } else {
+            unsigned long addr1;
+            addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+            /* RAM case */
+            ptr = phys_ram_base + addr1;
+            if (!cpu_physical_memory_is_dirty(addr1)) {
+                /* invalidate code */
+                tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
+                /* set dirty bit */
+                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
+                    (0xff & ~CODE_DIRTY_FLAG);
+            }
+            /* qemu doesn't execute guest code directly, but kvm does
+               therefore fluch instruction caches */
+            if (kvm_enabled())
+                flush_icache_range((unsigned long)ptr,
+                                   ((unsigned long)ptr)+l);
+        }
+        len -= l;
+        addr += l;
+    }
+}
+
 /* used for ROM loading : can write in RAM and ROM */
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len)

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [PATCH 3 of 5] rename dma.c to isa_dma.c
  2008-12-12 18:16 ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 18:16   ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

From: Andrea Arcangeli <aarcange@redhat.com>

Move dma.c to isa_dma.c.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/Makefile.target b/Makefile.target
--- a/Makefile.target
+++ b/Makefile.target
@@ -629,7 +629,7 @@ OBJS += e1000.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -640,7 +640,7 @@ ifeq ($(TARGET_BASE_ARCH), ppc)
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -657,7 +657,7 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), mips)
 OBJS+= mips_r4k.o mips_jazz.o mips_malta.o mips_mipssim.o
-OBJS+= mips_timer.o mips_int.o dma.o vga.o serial.o i8254.o i8259.o rc4030.o
+OBJS+= mips_timer.o mips_int.o isa_dma.o vga.o serial.o i8254.o i8259.o rc4030.o
 OBJS+= g364fb.o jazz_led.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o ds1225y.o
 OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
diff --git a/hw/dma.c b/hw/isa_dma.c
rename from hw/dma.c
rename to hw/isa_dma.c

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] [PATCH 3 of 5] rename dma.c to isa_dma.c
@ 2008-12-12 18:16   ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

From: Andrea Arcangeli <aarcange@redhat.com>

Move dma.c to isa_dma.c.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/Makefile.target b/Makefile.target
--- a/Makefile.target
+++ b/Makefile.target
@@ -629,7 +629,7 @@ OBJS += e1000.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -640,7 +640,7 @@ ifeq ($(TARGET_BASE_ARCH), ppc)
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -657,7 +657,7 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), mips)
 OBJS+= mips_r4k.o mips_jazz.o mips_malta.o mips_mipssim.o
-OBJS+= mips_timer.o mips_int.o dma.o vga.o serial.o i8254.o i8259.o rc4030.o
+OBJS+= mips_timer.o mips_int.o isa_dma.o vga.o serial.o i8254.o i8259.o rc4030.o
 OBJS+= g364fb.o jazz_led.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o ds1225y.o
 OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
diff --git a/hw/dma.c b/hw/isa_dma.c
rename from hw/dma.c
rename to hw/isa_dma.c

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [PATCH 4 of 5] dma api
  2008-12-12 18:16 ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 18:16   ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

From: Andrea Arcangeli <aarcange@redhat.com>

One major limitation for KVM today is the lack of a proper way to write drivers
in a way that allows the host OS to use direct DMA to the guest physical memory
to avoid any intermediate copy. The only API provided to drivers seems to be
the cpu_physical_memory_rw and that enforces all drivers to bounce and trash
cpu caches and be memory bound. This new DMA API instead allows drivers to use
a pci_dma_sg method for SG I/O that will translate the guest physical addresses
to host virtual addresses and it will call two operations, one is a submit
method and one is the complete method. The pci_dma_sg may have to bounce buffer
internally and to limit the max bounce size it may have to submit I/O in pieces
with multiple submit calls.

All we care about is the performance of the direct path, so I tried to
avoid dynamic allocations there to avoid entering glibc.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/Makefile.target b/Makefile.target
--- a/Makefile.target
+++ b/Makefile.target
@@ -629,7 +629,7 @@ OBJS += e1000.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -640,7 +640,7 @@ ifeq ($(TARGET_BASE_ARCH), ppc)
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o dma.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -658,7 +658,7 @@ ifeq ($(TARGET_BASE_ARCH), mips)
 ifeq ($(TARGET_BASE_ARCH), mips)
 OBJS+= mips_r4k.o mips_jazz.o mips_malta.o mips_mipssim.o
 OBJS+= mips_timer.o mips_int.o isa_dma.o vga.o serial.o i8254.o i8259.o rc4030.o
-OBJS+= g364fb.o jazz_led.o
+OBJS+= g364fb.o jazz_led.o dma.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o ds1225y.o
 OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
 OBJS+= mipsnet.o
@@ -667,7 +667,7 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), cris)
 OBJS+= etraxfs.o
-OBJS+= etraxfs_dma.o
+OBJS+= etraxfs_dma.o dma.o
 OBJS+= etraxfs_pic.o
 OBJS+= etraxfs_eth.o
 OBJS+= etraxfs_timer.o
@@ -678,13 +678,13 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), sparc)
 ifeq ($(TARGET_ARCH), sparc64)
-OBJS+= sun4u.o ide.o pckbd.o ps2.o vga.o apb_pci.o
+OBJS+= sun4u.o ide.o pckbd.o ps2.o vga.o apb_pci.o dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o m48t59.o
 OBJS+= cirrus_vga.o parallel.o ptimer.o
 else
 OBJS+= sun4m.o tcx.o pcnet.o iommu.o m48t59.o slavio_intctl.o
 OBJS+= slavio_timer.o slavio_serial.o slavio_misc.o fdc.o sparc32_dma.o
-OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o
+OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o dma.o
 endif
 endif
 ifeq ($(TARGET_BASE_ARCH), arm)
@@ -700,7 +700,7 @@ OBJS+= pflash_cfi01.o gumstix.o
 OBJS+= pflash_cfi01.o gumstix.o
 OBJS+= zaurus.o ide.o serial.o nand.o ecc.o spitz.o tosa.o tc6393xb.o
 OBJS+= omap1.o omap_lcdc.o omap_dma.o omap_clk.o omap_mmc.o omap_i2c.o
-OBJS+= omap2.o omap_dss.o soc_dma.o
+OBJS+= omap2.o omap_dss.o soc_dma.o dma.o
 OBJS+= palm.o tsc210x.o
 OBJS+= nseries.o blizzard.o onenand.o vga.o cbus.o tusb6010.o usb-musb.o
 OBJS+= tsc2005.o bt-hci-csr.o
@@ -711,11 +711,11 @@ ifeq ($(TARGET_BASE_ARCH), sh4)
 ifeq ($(TARGET_BASE_ARCH), sh4)
 OBJS+= shix.o r2d.o sh7750.o sh7750_regnames.o tc58128.o
 OBJS+= sh_timer.o ptimer.o sh_serial.o sh_intc.o sh_pci.o sm501.o serial.o
-OBJS+= ide.o
+OBJS+= ide.o dma.o
 endif
 ifeq ($(TARGET_BASE_ARCH), m68k)
 OBJS+= an5206.o mcf5206.o ptimer.o mcf_uart.o mcf_intc.o mcf5208.o mcf_fec.o
-OBJS+= m68k-semi.o dummy_m68k.o
+OBJS+= m68k-semi.o dummy_m68k.o dma.o
 endif
 ifdef CONFIG_GDBSTUB
 OBJS+=gdbstub.o gdbstub-xml.o
diff --git a/block.h b/block.h
--- a/block.h
+++ b/block.h
@@ -2,6 +2,7 @@
 #define BLOCK_H
 
 #include "qemu-aio.h"
+#include <sys/uio.h> /* struct iovec */
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
diff --git a/hw/dma.c b/hw/dma.c
new file mode 100644
--- /dev/null
+++ b/hw/dma.c
@@ -0,0 +1,366 @@
+/*
+ * QEMU PCI DMA operations
+ *
+ * Copyright (c) 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include "dma.h"
+
+#define MAX_DMA_BOUNCE_BUFFER (1024*1024)
+//#define DEBUG_BOUNCE
+//#define MAX_DMA_BOUNCE_BUFFER (512)
+//#define DISABLE_IOVEC_CACHE
+
+typedef struct QEMUPciDmaSgParam {
+    QEMUPciDmaSgSubmit *pci_dma_sg_submit;
+    QEMUPciDmaSgComplete *pci_dma_sg_complete;
+    void *pci_dma_sg_opaque;
+    int dma_to_memory;
+    int alignment;
+    uint8_t *bounce;
+    QEMUPciDmaSg *sg;
+    int iovcnt;
+    int restart_iovcnt;
+    size_t restart_offset;
+    int curr_restart_iovcnt;
+    size_t curr_restart_offset;
+    size_t curr_len;
+#ifndef DISABLE_IOVEC_CACHE
+    struct QEMUPciDmaSgParam *next;
+#endif
+    struct iovec iov;
+} QEMUPciDmaSgParam;
+
+#ifndef DISABLE_IOVEC_CACHE
+/*
+ * Too many entries will run slower and waste memory, this is really
+ * only about the fast path so it must be small, slow path is fine to
+ * allocate dynamically. Max memory used is slightly more than
+ * MAX_IOVEC_ENTRIES * MAX_IOVEC_IOVCNT * sizeof(struct iovec).
+ */
+#define MAX_IOVEC_ENTRIES 10
+
+/*
+ * Don't cache exceptionally large iovcnt used for huge DMA transfers
+ * as the DMA transfer may take much longer than malloc and huge
+ * memory could be wasted if it happens only once in a while.
+ */
+#define MAX_IOVEC_IOVCNT 2048
+
+static QEMUPciDmaSgParam *sg_list;
+static long max_sg_in_flight, sg_in_flight;
+
+static QEMUPciDmaSgParam *qemu_get_pci_dma_sg(int iovcnt)
+{
+    QEMUPciDmaSgParam *entry = sg_list, *last;
+
+    while (entry) {
+        if (entry->iovcnt >= iovcnt) {
+            if (entry == sg_list)
+                sg_list = sg_list->next;
+            else
+                last->next = entry->next;
+            goto found;
+        }
+        last = entry;
+        entry = entry->next;
+    }
+
+    entry = qemu_malloc(sizeof(QEMUPciDmaSgParam) +
+                        sizeof(struct iovec) * (iovcnt-1));
+    if (!entry)
+        return NULL;
+
+    if (iovcnt <= MAX_IOVEC_IOVCNT) {
+    found:
+        sg_in_flight += 1;
+        if (sg_in_flight > max_sg_in_flight)
+            max_sg_in_flight = sg_in_flight;
+    }
+    return entry;
+}
+
+static void qemu_release_pci_dma_sg(QEMUPciDmaSgParam *this)
+{
+    QEMUPciDmaSgParam *min_entry = NULL, *entry = sg_list;
+    QEMUPciDmaSgParam *min_last = NULL, *last = NULL;
+    unsigned int min_iovcnt = -1;
+    int nr = 0, tot;
+
+    if (this->iovcnt > MAX_IOVEC_IOVCNT) {
+        qemu_free(this);
+        return;
+    }
+
+    while (entry) {
+        nr += 1;
+        if ((unsigned int)entry->iovcnt <= min_iovcnt) {
+            min_entry = entry;
+            min_last = last;
+            min_iovcnt = entry->iovcnt;
+        }
+        last = entry;
+        entry = entry->next;
+    }
+
+    assert(max_sg_in_flight > 0);
+    assert(sg_in_flight > 0);
+    tot = nr+sg_in_flight; 
+    if (tot > max_sg_in_flight || tot > MAX_IOVEC_ENTRIES) {
+        /* detail: replace even if it's equal as it's cache hot */
+        if ((unsigned int)this->iovcnt < min_iovcnt)
+            qemu_free(this);
+        else {
+            assert(nr > 0);
+            if (min_entry == sg_list) {
+                this->next = sg_list->next;
+            } else {
+                min_last->next = min_entry->next;
+                this->next = sg_list;
+            }
+            sg_list = this;
+            qemu_free(min_entry);
+        }
+    } else {
+        this->next = sg_list;
+        sg_list = this;
+    }
+    sg_in_flight -= 1;
+    assert(sg_in_flight >= 0);
+}
+#else /* DISABLE_IOVEC_CACHE */
+#define qemu_get_pci_dma_sg(iovcnt) qemu_malloc(sizeof(QEMUPciDmaSgParam)+(sizeof(struct iovec)*((iovcnt)-1))) 
+#define qemu_release_pci_dma_sg(param) qemu_free(param)
+#endif /* DISABLE_IOVEC_CACHE */
+
+static int pci_dma_sg_map_direct(QEMUPciDmaSg *sg,
+                                 int iovcnt,
+                                 int dma_to_memory,
+                                 int alignment,
+                                 size_t *len,
+                                 struct iovec *dma_iov)
+{
+    int idx = 0;
+    size_t _len = 0;
+
+#ifdef DEBUG_BOUNCE
+    return 0;
+#endif
+
+    for (idx = 0; idx < iovcnt; idx++) {
+        void * addr;
+
+        if (_len + sg[idx].len <= _len)
+            return 0;
+        _len += sg[idx].len;
+
+        addr = cpu_physical_memory_can_dma(sg[idx].addr,
+                                           sg[idx].len,
+                                           dma_to_memory,
+                                           alignment);
+        if (!addr)
+            return 0;
+
+        dma_iov[idx].iov_base = addr;
+        dma_iov[idx].iov_len = sg[idx].len;
+    }
+
+    *len = _len;
+    return 1;
+}
+
+static int pci_dma_sg_map_bounce(QEMUPciDmaSgParam *param)
+{ /* map the next chunk of the sg through the bounce buffer; 1=ok, 0=failure */
+    int idx;
+    size_t len = 0;
+
+    param->curr_restart_iovcnt = param->restart_iovcnt; /* chunk start, needed by unmap */
+    param->curr_restart_offset = param->restart_offset;
+
+    for (idx = param->restart_iovcnt; idx < param->iovcnt; idx++) {
+        if (len + param->sg[idx].len <= len) /* overflow or zero-length entry */
+            return 0;
+        len += param->sg[idx].len - param->restart_offset;
+        param->restart_offset = 0; /* the offset only applies to the first entry */
+        if (len > MAX_DMA_BOUNCE_BUFFER) { /* cap the chunk; remember the restart point */
+            size_t leftover = len - MAX_DMA_BOUNCE_BUFFER;
+            param->restart_offset = param->sg[idx].len - leftover;
+            len = MAX_DMA_BOUNCE_BUFFER;
+            break;
+        }
+    }
+    param->restart_iovcnt = idx;
+    param->curr_len = len;
+
+    if (len & (param->alignment-1)) /* every chunk must preserve the alignment */
+        return 0;
+
+    param->iov.iov_len = len;
+    if (!param->bounce) {
+        /* the first chunk is the largest one, so the buffer can be reused on restart */
+        param->bounce = qemu_memalign(param->alignment, len);
+        if (!param->bounce)
+            return 0;
+        param->iov.iov_base = param->bounce;
+    }
+
+    if (!param->dma_to_memory) { /* guest->device: prefill the bounce buffer */
+        int idx;
+        size_t offset = 0;
+        for (idx = param->curr_restart_iovcnt;
+             idx < param->iovcnt && offset < len; idx++) {
+            size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+            if (offset+copy_len > len)
+                copy_len = len - offset; /* fix: clamp to the space left, not the total;
+                                            "copy_len = len" overflowed the bounce buffer
+                                            whenever a chunk boundary split an sg entry */
+            cpu_physical_memory_read(param->sg[idx].addr +
+                                     param->curr_restart_offset,
+                                     param->bounce + offset,
+                                     copy_len);
+            param->curr_restart_offset = 0;
+            offset += copy_len;
+        }
+    }
+
+    return 1;
+}
+
+static void pci_dma_sg_unmap_direct(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) { /* device wrote guest ram successfully */
+        int idx;
+        QEMUPciDmaSg *sg = param->sg;
+        for (idx = 0; idx < param->iovcnt; idx++)
+            cpu_physical_memory_write_post_dma(sg[idx].addr, /* post-DMA write fixup — presumably dirty tracking; see the helper */
+                                               sg[idx].len);
+    }
+}
+
+static int pci_dma_sg_unmap_bounce(QEMUPciDmaSgParam *param, int ret)
+{ /* copy bounced data back and decide whether another chunk must be submitted */
+    if (!ret && param->dma_to_memory) { /* device->guest: copy the chunk back on success */
+        int idx;
+        size_t offset = 0;
+        for (idx = param->curr_restart_iovcnt;
+             idx < param->iovcnt && offset < param->curr_len; idx++) {
+            size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+            if (offset+copy_len > param->curr_len)
+                copy_len = param->curr_len - offset; /* fix: clamp to what is left of the
+                                                        chunk; "copy_len = curr_len" wrote
+                                                        past the intended guest range */
+            cpu_physical_memory_write(param->sg[idx].addr +
+                                      param->curr_restart_offset,
+                                      param->bounce + offset,
+                                      copy_len);
+            param->curr_restart_offset = 0;
+            offset += copy_len;
+        }
+    }
+    if (param->restart_iovcnt == param->iovcnt || ret) { /* all done or failed: release */
+        qemu_free(param->bounce);
+        return 0;
+    }
+    return 1; /* more sg left: caller restarts with the next chunk */
+}
+
+static void pci_dma_sg_cb(void *opaque, int ret)
+{ /* I/O completion: finish direct DMA, or copy back / restart the bounced transfer */
+    QEMUPciDmaSgParam *param = opaque;
+    int restart = 0;
+
+    if (!param->bounce)
+        pci_dma_sg_unmap_direct(param, ret); /* direct path: post-DMA fixup only */
+    else
+        restart = pci_dma_sg_unmap_bounce(param, ret); /* 1 = more chunks pending */
+
+    if (restart) {
+        ret = -1000; /* error reported if re-mapping the next chunk fails */
+        if (!pci_dma_sg_map_bounce(param)) {
+            qemu_free(param->bounce);
+            goto out_free;
+        }
+        ret = param->pci_dma_sg_submit(param->pci_dma_sg_opaque,
+                                       &param->iov, 1,
+                                       param->curr_len,
+                                       pci_dma_sg_cb,
+                                       param);
+    }
+    if (ret || !restart) { /* transfer finished, or a map/submit failed */
+    out_free:
+        param->pci_dma_sg_complete(param->pci_dma_sg_opaque, ret);
+        qemu_release_pci_dma_sg(param);
+    }
+}
+
+/* PCIDevice is there in case we want to emulate an iommu later */
+void pci_dma_sg(PCIDevice *pci_dev,
+                QEMUPciDmaSg *sg, int iovcnt,
+                QEMUPciDmaSgSubmit pci_dma_sg_submit,
+                QEMUPciDmaSgComplete pci_dma_sg_complete,
+                void *pci_dma_sg_opaque,
+                int dma_to_memory, int alignment)
+{ /* entry point: map guest sg (direct or bounced), submit I/O, complete via callbacks */
+    int ret;
+    QEMUPciDmaSgParam *param;
+
+    ret = -2000; /* parameter sanity checks */
+    if ((unsigned int) dma_to_memory > 1) /* must be a 0/1 boolean */
+        goto err;
+    if ((unsigned int) alignment > MAX_DMA_BOUNCE_BUFFER) /* bounce chunk must satisfy alignment */
+        goto err;
+    if (iovcnt < 1)
+        goto err;
+
+    param = qemu_get_pci_dma_sg(iovcnt); /* cached or fresh state + embedded iovec array */
+    if (!param)
+        goto err;
+
+    param->pci_dma_sg_submit = pci_dma_sg_submit;
+    param->pci_dma_sg_complete = pci_dma_sg_complete;
+    param->pci_dma_sg_opaque = pci_dma_sg_opaque;
+    param->dma_to_memory = dma_to_memory;
+    param->alignment = alignment;
+    param->bounce = NULL;
+    param->sg = sg;
+    param->iovcnt = iovcnt;
+    param->restart_offset = param->restart_iovcnt = 0;
+
+    /* map the sg */
+    if (!pci_dma_sg_map_direct(sg, iovcnt,
+                               dma_to_memory, alignment,
+                               &param->curr_len, &param->iov)) {
+        ret = -2004; /* direct mapping impossible: fall back to bouncing */
+        if (!pci_dma_sg_map_bounce(param))
+            goto out_free;
+        iovcnt = 1; /* bounced I/O is a single contiguous iovec */
+    }
+
+    /* run the I/O */
+    ret = pci_dma_sg_submit(pci_dma_sg_opaque,
+                            &param->iov, iovcnt, param->curr_len,
+                            pci_dma_sg_cb,
+                            param);
+    if (ret) /* immediate failure; the label below doubles as the bounce-map error path */
+    out_free:
+        pci_dma_sg_cb(param, ret);
+    return;
+
+err:
+    pci_dma_sg_complete(pci_dma_sg_opaque, ret);
+    return;
+}
diff --git a/hw/dma.h b/hw/dma.h
new file mode 100644
--- /dev/null
+++ b/hw/dma.h
@@ -0,0 +1,28 @@
+#ifndef QEMU_PCI_DMA_H
+#define QEMU_PCI_DMA_H
+
+#include "qemu-common.h"
+#include "block.h"
+
+typedef int QEMUPciDmaSgSubmit(void *pci_dma_sg_opaque, /* starts the actual I/O; 0 = submitted */
+                               struct iovec *iov, int iovcnt,
+                               size_t len,
+                               BlockDriverCompletionFunc dma_cb,
+                               void *dma_cb_param);
+
+typedef void QEMUPciDmaSgComplete(void *pci_dma_sg_opaque, int ret); /* final completion, 0 = success */
+
+typedef struct QEMUPciDmaSg { /* one guest-physical scatter-gather element */
+    target_phys_addr_t addr;
+    size_t len;
+} QEMUPciDmaSg;
+
+/* dma.c */
+void pci_dma_sg(PCIDevice *pci_dev,
+                QEMUPciDmaSg *sg, int iovcnt,
+                QEMUPciDmaSgSubmit *pci_dma_sg_submit,
+                QEMUPciDmaSgComplete *pci_dma_sg_complete,
+                void *pci_dma_sg_opaque,
+                int dma_to_memory, int alignment);
+
+#endif

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] [PATCH 4 of 5] dma api
@ 2008-12-12 18:16   ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

From: Andrea Arcangeli <aarcange@redhat.com>

One major limitation for KVM today is the lack of a proper way to write drivers
in a way that allows the host OS to use direct DMA to the guest physical memory
to avoid any intermediate copy. The only API provided to drivers seems to be
the cpu_physical_memory_rw and that enforces all drivers to bounce and trash
cpu caches and be memory bound. This new DMA API instead allows drivers to use
a pci_dma_sg method for SG I/O that will translate the guest physical addresses
to host virtual addresses and it will call two operations, one is a submit
method and one is the complete method. The pci_dma_sg may have to bounce buffer
internally and to limit the max bounce size it may have to submit I/O in pieces
with multiple submit calls.

All we care about is the performance of the direct path, so I tried to
avoid dynamic allocations there to avoid entering glibc.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/Makefile.target b/Makefile.target
--- a/Makefile.target
+++ b/Makefile.target
@@ -629,7 +629,7 @@ OBJS += e1000.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -640,7 +640,7 @@ ifeq ($(TARGET_BASE_ARCH), ppc)
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o dma.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -658,7 +658,7 @@ ifeq ($(TARGET_BASE_ARCH), mips)
 ifeq ($(TARGET_BASE_ARCH), mips)
 OBJS+= mips_r4k.o mips_jazz.o mips_malta.o mips_mipssim.o
 OBJS+= mips_timer.o mips_int.o isa_dma.o vga.o serial.o i8254.o i8259.o rc4030.o
-OBJS+= g364fb.o jazz_led.o
+OBJS+= g364fb.o jazz_led.o dma.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o ds1225y.o
 OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
 OBJS+= mipsnet.o
@@ -667,7 +667,7 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), cris)
 OBJS+= etraxfs.o
-OBJS+= etraxfs_dma.o
+OBJS+= etraxfs_dma.o dma.o
 OBJS+= etraxfs_pic.o
 OBJS+= etraxfs_eth.o
 OBJS+= etraxfs_timer.o
@@ -678,13 +678,13 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), sparc)
 ifeq ($(TARGET_ARCH), sparc64)
-OBJS+= sun4u.o ide.o pckbd.o ps2.o vga.o apb_pci.o
+OBJS+= sun4u.o ide.o pckbd.o ps2.o vga.o apb_pci.o dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o m48t59.o
 OBJS+= cirrus_vga.o parallel.o ptimer.o
 else
 OBJS+= sun4m.o tcx.o pcnet.o iommu.o m48t59.o slavio_intctl.o
 OBJS+= slavio_timer.o slavio_serial.o slavio_misc.o fdc.o sparc32_dma.o
-OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o
+OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o dma.o
 endif
 endif
 ifeq ($(TARGET_BASE_ARCH), arm)
@@ -700,7 +700,7 @@ OBJS+= pflash_cfi01.o gumstix.o
 OBJS+= pflash_cfi01.o gumstix.o
 OBJS+= zaurus.o ide.o serial.o nand.o ecc.o spitz.o tosa.o tc6393xb.o
 OBJS+= omap1.o omap_lcdc.o omap_dma.o omap_clk.o omap_mmc.o omap_i2c.o
-OBJS+= omap2.o omap_dss.o soc_dma.o
+OBJS+= omap2.o omap_dss.o soc_dma.o dma.o
 OBJS+= palm.o tsc210x.o
 OBJS+= nseries.o blizzard.o onenand.o vga.o cbus.o tusb6010.o usb-musb.o
 OBJS+= tsc2005.o bt-hci-csr.o
@@ -711,11 +711,11 @@ ifeq ($(TARGET_BASE_ARCH), sh4)
 ifeq ($(TARGET_BASE_ARCH), sh4)
 OBJS+= shix.o r2d.o sh7750.o sh7750_regnames.o tc58128.o
 OBJS+= sh_timer.o ptimer.o sh_serial.o sh_intc.o sh_pci.o sm501.o serial.o
-OBJS+= ide.o
+OBJS+= ide.o dma.o
 endif
 ifeq ($(TARGET_BASE_ARCH), m68k)
 OBJS+= an5206.o mcf5206.o ptimer.o mcf_uart.o mcf_intc.o mcf5208.o mcf_fec.o
-OBJS+= m68k-semi.o dummy_m68k.o
+OBJS+= m68k-semi.o dummy_m68k.o dma.o
 endif
 ifdef CONFIG_GDBSTUB
 OBJS+=gdbstub.o gdbstub-xml.o
diff --git a/block.h b/block.h
--- a/block.h
+++ b/block.h
@@ -2,6 +2,7 @@
 #define BLOCK_H
 
 #include "qemu-aio.h"
+#include <sys/uio.h> /* struct iovec */
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
diff --git a/hw/dma.c b/hw/dma.c
new file mode 100644
--- /dev/null
+++ b/hw/dma.c
@@ -0,0 +1,366 @@
+/*
+ * QEMU PCI DMA operations
+ *
+ * Copyright (c) 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include "dma.h"
+
+#define MAX_DMA_BOUNCE_BUFFER (1024*1024)
+//#define DEBUG_BOUNCE
+//#define MAX_DMA_BOUNCE_BUFFER (512)
+//#define DISABLE_IOVEC_CACHE
+
+typedef struct QEMUPciDmaSgParam {
+    QEMUPciDmaSgSubmit *pci_dma_sg_submit;
+    QEMUPciDmaSgComplete *pci_dma_sg_complete;
+    void *pci_dma_sg_opaque;
+    int dma_to_memory;
+    int alignment;
+    uint8_t *bounce;
+    QEMUPciDmaSg *sg;
+    int iovcnt;
+    int restart_iovcnt;
+    size_t restart_offset;
+    int curr_restart_iovcnt;
+    size_t curr_restart_offset;
+    size_t curr_len;
+#ifndef DISABLE_IOVEC_CACHE
+    struct QEMUPciDmaSgParam *next;
+#endif
+    struct iovec iov;
+} QEMUPciDmaSgParam;
+
+#ifndef DISABLE_IOVEC_CACHE
+/*
+ * Too many entries will run slower and waste memory, this is really
+ * only about the fast path so it must be small, slow path is fine to
+ * allocate dynamically. Max memory used is slightly more than
+ * MAX_IOVEC_ENTRIES * MAX_IOVEC_IOVCNT * sizeof(struct iovec).
+ */
+#define MAX_IOVEC_ENTRIES 10
+
+/*
+ * Don't cache exceptionally large iovcnt used for huge DMA transfers
+ * as the DMA transfer may take much longer than malloc and huge
+ * memory could be wasted if it happens only once in a while.
+ */
+#define MAX_IOVEC_IOVCNT 2048
+
+static QEMUPciDmaSgParam *sg_list;
+static long max_sg_in_flight, sg_in_flight;
+
+static QEMUPciDmaSgParam *qemu_get_pci_dma_sg(int iovcnt)
+{ /* first-fit lookup in the small free-list cache, falling back to malloc */
+    QEMUPciDmaSgParam *entry = sg_list, *last;
+
+    while (entry) {
+        if (entry->iovcnt >= iovcnt) { /* cached entry big enough: unlink and reuse it */
+            if (entry == sg_list)
+                sg_list = sg_list->next;
+            else
+                last->next = entry->next; /* last is always assigned before a non-head hit */
+            goto found;
+        }
+        last = entry;
+        entry = entry->next;
+    }
+
+    entry = qemu_malloc(sizeof(QEMUPciDmaSgParam) + /* room for iovcnt iovecs; one is embedded */
+                        sizeof(struct iovec) * (iovcnt-1));
+    if (!entry)
+        return NULL;
+
+    if (iovcnt <= MAX_IOVEC_IOVCNT) { /* oversized requests bypass the cache accounting */
+    found:
+        sg_in_flight += 1;
+        if (sg_in_flight > max_sg_in_flight)
+            max_sg_in_flight = sg_in_flight; /* high-water mark bounds the cache size */
+    }
+    return entry;
+}
+
+static void qemu_release_pci_dma_sg(QEMUPciDmaSgParam *this)
+{ /* return an entry to the cache, evicting the smallest one if the cache is full */
+    QEMUPciDmaSgParam *min_entry = NULL, *entry = sg_list;
+    QEMUPciDmaSgParam *min_last = NULL, *last = NULL;
+    unsigned int min_iovcnt = -1; /* UINT_MAX: any real entry compares smaller */
+    int nr = 0, tot;
+
+    if (this->iovcnt > MAX_IOVEC_IOVCNT) { /* huge entries are never cached */
+        qemu_free(this);
+        return;
+    }
+
+    while (entry) { /* count the cache and find the smallest entry (eviction candidate) */
+        nr += 1;
+        if ((unsigned int)entry->iovcnt <= min_iovcnt) {
+            min_entry = entry;
+            min_last = last;
+            min_iovcnt = entry->iovcnt;
+        }
+        last = entry;
+        entry = entry->next;
+    }
+
+    assert(max_sg_in_flight > 0);
+    assert(sg_in_flight > 0);
+    tot = nr+sg_in_flight; 
+    if (tot > max_sg_in_flight || tot > MAX_IOVEC_ENTRIES) { /* cache full: evict one */
+        /* detail: replace even if it's equal as it's cache hot */
+        if ((unsigned int)this->iovcnt < min_iovcnt)
+            qemu_free(this); /* smaller than the smallest cached entry: just drop it */
+        else {
+            assert(nr > 0);
+            if (min_entry == sg_list) {
+                this->next = sg_list->next;
+            } else {
+                min_last->next = min_entry->next;
+                this->next = sg_list;
+            }
+            sg_list = this; /* cache "this", free the evicted minimum */
+            qemu_free(min_entry);
+        }
+    } else {
+        this->next = sg_list;
+        sg_list = this;
+    }
+    sg_in_flight -= 1;
+    assert(sg_in_flight >= 0);
+}
+#else /* DISABLE_IOVEC_CACHE */
+#define qemu_get_pci_dma_sg(iovcnt) qemu_malloc(sizeof(QEMUPciDmaSgParam)+(sizeof(struct iovec)*((iovcnt)-1))) 
+#define qemu_release_pci_dma_sg(param) qemu_free(param)
+#endif /* DISABLE_IOVEC_CACHE */
+
+static int pci_dma_sg_map_direct(QEMUPciDmaSg *sg,
+                                 int iovcnt,
+                                 int dma_to_memory,
+                                 int alignment,
+                                 size_t *len,
+                                 struct iovec *dma_iov)
+{ /* zero-copy attempt: translate each sg entry to a host pointer; 1=ok, 0=caller must bounce */
+    int idx = 0;
+    size_t _len = 0;
+
+#ifdef DEBUG_BOUNCE
+    return 0; /* force the bounce-buffer slow path for testing */
+#endif
+
+    for (idx = 0; idx < iovcnt; idx++) {
+        void * addr;
+
+        if (_len + sg[idx].len <= _len) /* rejects both size_t overflow and zero-length entries */
+            return 0;
+        _len += sg[idx].len;
+
+        addr = cpu_physical_memory_can_dma(sg[idx].addr,
+                                           sg[idx].len,
+                                           dma_to_memory,
+                                           alignment);
+        if (!addr) /* region not directly mappable (e.g. MMIO) */
+            return 0;
+
+        dma_iov[idx].iov_base = addr;
+        dma_iov[idx].iov_len = sg[idx].len;
+    }
+
+    *len = _len; /* total transfer length; only meaningful on success */
+    return 1;
+}
+
+static int pci_dma_sg_map_bounce(QEMUPciDmaSgParam *param)
+{ /* map the next chunk of the sg through the bounce buffer; 1=ok, 0=failure */
+    int idx;
+    size_t len = 0;
+
+    param->curr_restart_iovcnt = param->restart_iovcnt; /* chunk start, needed by unmap */
+    param->curr_restart_offset = param->restart_offset;
+
+    for (idx = param->restart_iovcnt; idx < param->iovcnt; idx++) {
+        if (len + param->sg[idx].len <= len) /* overflow or zero-length entry */
+            return 0;
+        len += param->sg[idx].len - param->restart_offset;
+        param->restart_offset = 0; /* the offset only applies to the first entry */
+        if (len > MAX_DMA_BOUNCE_BUFFER) { /* cap the chunk; remember the restart point */
+            size_t leftover = len - MAX_DMA_BOUNCE_BUFFER;
+            param->restart_offset = param->sg[idx].len - leftover;
+            len = MAX_DMA_BOUNCE_BUFFER;
+            break;
+        }
+    }
+    param->restart_iovcnt = idx;
+    param->curr_len = len;
+
+    if (len & (param->alignment-1)) /* every chunk must preserve the alignment */
+        return 0;
+
+    param->iov.iov_len = len;
+    if (!param->bounce) {
+        /* the first chunk is the largest one, so the buffer can be reused on restart */
+        param->bounce = qemu_memalign(param->alignment, len);
+        if (!param->bounce)
+            return 0;
+        param->iov.iov_base = param->bounce;
+    }
+
+    if (!param->dma_to_memory) { /* guest->device: prefill the bounce buffer */
+        int idx;
+        size_t offset = 0;
+        for (idx = param->curr_restart_iovcnt;
+             idx < param->iovcnt && offset < len; idx++) {
+            size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+            if (offset+copy_len > len)
+                copy_len = len - offset; /* fix: clamp to the space left, not the total;
+                                            "copy_len = len" overflowed the bounce buffer
+                                            whenever a chunk boundary split an sg entry */
+            cpu_physical_memory_read(param->sg[idx].addr +
+                                     param->curr_restart_offset,
+                                     param->bounce + offset,
+                                     copy_len);
+            param->curr_restart_offset = 0;
+            offset += copy_len;
+        }
+    }
+
+    return 1;
+}
+
+static void pci_dma_sg_unmap_direct(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) { /* device wrote guest ram successfully */
+        int idx;
+        QEMUPciDmaSg *sg = param->sg;
+        for (idx = 0; idx < param->iovcnt; idx++)
+            cpu_physical_memory_write_post_dma(sg[idx].addr, /* post-DMA write fixup — presumably dirty tracking; see the helper */
+                                               sg[idx].len);
+    }
+}
+
+static int pci_dma_sg_unmap_bounce(QEMUPciDmaSgParam *param, int ret)
+{ /* copy bounced data back and decide whether another chunk must be submitted */
+    if (!ret && param->dma_to_memory) { /* device->guest: copy the chunk back on success */
+        int idx;
+        size_t offset = 0;
+        for (idx = param->curr_restart_iovcnt;
+             idx < param->iovcnt && offset < param->curr_len; idx++) {
+            size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+            if (offset+copy_len > param->curr_len)
+                copy_len = param->curr_len - offset; /* fix: clamp to what is left of the
+                                                        chunk; "copy_len = curr_len" wrote
+                                                        past the intended guest range */
+            cpu_physical_memory_write(param->sg[idx].addr +
+                                      param->curr_restart_offset,
+                                      param->bounce + offset,
+                                      copy_len);
+            param->curr_restart_offset = 0;
+            offset += copy_len;
+        }
+    }
+    if (param->restart_iovcnt == param->iovcnt || ret) { /* all done or failed: release */
+        qemu_free(param->bounce);
+        return 0;
+    }
+    return 1; /* more sg left: caller restarts with the next chunk */
+}
+
+static void pci_dma_sg_cb(void *opaque, int ret)
+{ /* I/O completion: finish direct DMA, or copy back / restart the bounced transfer */
+    QEMUPciDmaSgParam *param = opaque;
+    int restart = 0;
+
+    if (!param->bounce)
+        pci_dma_sg_unmap_direct(param, ret); /* direct path: post-DMA fixup only */
+    else
+        restart = pci_dma_sg_unmap_bounce(param, ret); /* 1 = more chunks pending */
+
+    if (restart) {
+        ret = -1000; /* error reported if re-mapping the next chunk fails */
+        if (!pci_dma_sg_map_bounce(param)) {
+            qemu_free(param->bounce);
+            goto out_free;
+        }
+        ret = param->pci_dma_sg_submit(param->pci_dma_sg_opaque,
+                                       &param->iov, 1,
+                                       param->curr_len,
+                                       pci_dma_sg_cb,
+                                       param);
+    }
+    if (ret || !restart) { /* transfer finished, or a map/submit failed */
+    out_free:
+        param->pci_dma_sg_complete(param->pci_dma_sg_opaque, ret);
+        qemu_release_pci_dma_sg(param);
+    }
+}
+
+/* PCIDevice is there in case we want to emulate an iommu later */
+void pci_dma_sg(PCIDevice *pci_dev,
+                QEMUPciDmaSg *sg, int iovcnt,
+                QEMUPciDmaSgSubmit pci_dma_sg_submit,
+                QEMUPciDmaSgComplete pci_dma_sg_complete,
+                void *pci_dma_sg_opaque,
+                int dma_to_memory, int alignment)
+{ /* entry point: map guest sg (direct or bounced), submit I/O, complete via callbacks */
+    int ret;
+    QEMUPciDmaSgParam *param;
+
+    ret = -2000; /* parameter sanity checks */
+    if ((unsigned int) dma_to_memory > 1) /* must be a 0/1 boolean */
+        goto err;
+    if ((unsigned int) alignment > MAX_DMA_BOUNCE_BUFFER) /* bounce chunk must satisfy alignment */
+        goto err;
+    if (iovcnt < 1)
+        goto err;
+
+    param = qemu_get_pci_dma_sg(iovcnt); /* cached or fresh state + embedded iovec array */
+    if (!param)
+        goto err;
+
+    param->pci_dma_sg_submit = pci_dma_sg_submit;
+    param->pci_dma_sg_complete = pci_dma_sg_complete;
+    param->pci_dma_sg_opaque = pci_dma_sg_opaque;
+    param->dma_to_memory = dma_to_memory;
+    param->alignment = alignment;
+    param->bounce = NULL;
+    param->sg = sg;
+    param->iovcnt = iovcnt;
+    param->restart_offset = param->restart_iovcnt = 0;
+
+    /* map the sg */
+    if (!pci_dma_sg_map_direct(sg, iovcnt,
+                               dma_to_memory, alignment,
+                               &param->curr_len, &param->iov)) {
+        ret = -2004; /* direct mapping impossible: fall back to bouncing */
+        if (!pci_dma_sg_map_bounce(param))
+            goto out_free;
+        iovcnt = 1; /* bounced I/O is a single contiguous iovec */
+    }
+
+    /* run the I/O */
+    ret = pci_dma_sg_submit(pci_dma_sg_opaque,
+                            &param->iov, iovcnt, param->curr_len,
+                            pci_dma_sg_cb,
+                            param);
+    if (ret) /* immediate failure; the label below doubles as the bounce-map error path */
+    out_free:
+        pci_dma_sg_cb(param, ret);
+    return;
+
+err:
+    pci_dma_sg_complete(pci_dma_sg_opaque, ret);
+    return;
+}
diff --git a/hw/dma.h b/hw/dma.h
new file mode 100644
--- /dev/null
+++ b/hw/dma.h
@@ -0,0 +1,28 @@
+#ifndef QEMU_PCI_DMA_H
+#define QEMU_PCI_DMA_H
+
+#include "qemu-common.h"
+#include "block.h"
+
+typedef int QEMUPciDmaSgSubmit(void *pci_dma_sg_opaque, /* starts the actual I/O; 0 = submitted */
+                               struct iovec *iov, int iovcnt,
+                               size_t len,
+                               BlockDriverCompletionFunc dma_cb,
+                               void *dma_cb_param);
+
+typedef void QEMUPciDmaSgComplete(void *pci_dma_sg_opaque, int ret); /* final completion, 0 = success */
+
+typedef struct QEMUPciDmaSg { /* one guest-physical scatter-gather element */
+    target_phys_addr_t addr;
+    size_t len;
+} QEMUPciDmaSg;
+
+/* dma.c */
+void pci_dma_sg(PCIDevice *pci_dev,
+                QEMUPciDmaSg *sg, int iovcnt,
+                QEMUPciDmaSgSubmit *pci_dma_sg_submit,
+                QEMUPciDmaSgComplete *pci_dma_sg_complete,
+                void *pci_dma_sg_opaque,
+                int dma_to_memory, int alignment);
+
+#endif

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [PATCH 5 of 5] bdrv_aio_readv/writev
  2008-12-12 18:16 ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 18:16   ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

From: Andrea Arcangeli <aarcange@redhat.com>

bdrv_aio_readv/writev methods (depends on lower level bdrv_aio_readv).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/block.c b/block.c
--- a/block.c
+++ b/block.c
@@ -1296,6 +1296,50 @@ void bdrv_aio_cancel(BlockDriverAIOCB *a
     drv->bdrv_aio_cancel(acb);
 }
 
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+				 struct iovec *iov, int iovcnt, size_t len,
+				 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    /* not every BlockDriver implements the vectored callbacks yet */
+    if (!drv || !drv->bdrv_aio_readv)
+        return NULL;
+
+    ret = drv->bdrv_aio_readv(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+	/* Update stats even though technically transfer has not happened. */
+	bs->rd_bytes += (unsigned) len;
+	bs->rd_ops ++;
+    }
+
+    return ret;
+}
+
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+				  struct iovec *iov, int iovcnt, size_t len,
+				  BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    /* not every BlockDriver implements the vectored callbacks yet */
+    if (!drv || !drv->bdrv_aio_writev)
+        return NULL;
+
+    ret = drv->bdrv_aio_writev(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+	/* Update stats even though technically transfer has not happened. */
+	bs->wr_bytes += (unsigned) len; /* fix: was rd_bytes/rd_ops, pasted from readv */
+	bs->wr_ops ++;
+    }
+
+    return ret;
+}
+
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
--- a/block.h
+++ b/block.h
@@ -85,6 +85,13 @@ int bdrv_commit(BlockDriverState *bs);
 /* async block I/O */
 typedef struct BlockDriverAIOCB BlockDriverAIOCB;
 typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+typedef BlockDriverAIOCB *BlockDriverAIOIOV(BlockDriverState *bs,
+					    int64_t sector_num,
+					    struct iovec *iov,
+					    int iovnct,
+					    size_t len,
+					    BlockDriverCompletionFunc *cb,
+					    void *opaque);
 
 BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num,
                                 uint8_t *buf, int nb_sectors,
@@ -93,6 +100,12 @@ BlockDriverAIOCB *bdrv_aio_write(BlockDr
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+				 struct iovec *iov, int iovnct, size_t len,
+				 BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+				  struct iovec *iov, int iovnct, size_t len,
+				  BlockDriverCompletionFunc *cb, void *opaque);
 
 int qemu_key_check(BlockDriverState *bs, const char *name);
 
diff --git a/block_int.h b/block_int.h
--- a/block_int.h
+++ b/block_int.h
@@ -55,6 +55,8 @@ struct BlockDriver {
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+    BlockDriverAIOIOV *bdrv_aio_readv;
+    BlockDriverAIOIOV *bdrv_aio_writev;
     int aiocb_size;
 
     const char *protocol_name;

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] [PATCH 5 of 5] bdrv_aio_readv/writev
@ 2008-12-12 18:16   ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

From: Andrea Arcangeli <aarcange@redhat.com>

bdrv_aio_readv/writev methods (depends on lower level bdrv_aio_readv).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/block.c b/block.c
--- a/block.c
+++ b/block.c
@@ -1296,6 +1296,50 @@ void bdrv_aio_cancel(BlockDriverAIOCB *a
     drv->bdrv_aio_cancel(acb);
 }
 
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+				 struct iovec *iov, int iovcnt, size_t len,
+				 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    /* not every BlockDriver implements the vectored callbacks yet */
+    if (!drv || !drv->bdrv_aio_readv)
+        return NULL;
+
+    ret = drv->bdrv_aio_readv(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+	/* Update stats even though technically transfer has not happened. */
+	bs->rd_bytes += (unsigned) len;
+	bs->rd_ops ++;
+    }
+
+    return ret;
+}
+
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+				  struct iovec *iov, int iovcnt, size_t len,
+				  BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    /* not every BlockDriver implements the vectored callbacks yet */
+    if (!drv || !drv->bdrv_aio_writev)
+        return NULL;
+
+    ret = drv->bdrv_aio_writev(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+	/* Update stats even though technically transfer has not happened. */
+	bs->wr_bytes += (unsigned) len; /* fix: was rd_bytes/rd_ops, pasted from readv */
+	bs->wr_ops ++;
+    }
+
+    return ret;
+}
+
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
--- a/block.h
+++ b/block.h
@@ -85,6 +85,13 @@ int bdrv_commit(BlockDriverState *bs);
 /* async block I/O */
 typedef struct BlockDriverAIOCB BlockDriverAIOCB;
 typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+typedef BlockDriverAIOCB *BlockDriverAIOIOV(BlockDriverState *bs,
+					    int64_t sector_num,
+					    struct iovec *iov,
+					    int iovnct,
+					    size_t len,
+					    BlockDriverCompletionFunc *cb,
+					    void *opaque);
 
 BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num,
                                 uint8_t *buf, int nb_sectors,
@@ -93,6 +100,12 @@ BlockDriverAIOCB *bdrv_aio_write(BlockDr
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+				 struct iovec *iov, int iovnct, size_t len,
+				 BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+				  struct iovec *iov, int iovnct, size_t len,
+				  BlockDriverCompletionFunc *cb, void *opaque);
 
 int qemu_key_check(BlockDriverState *bs, const char *name);
 
diff --git a/block_int.h b/block_int.h
--- a/block_int.h
+++ b/block_int.h
@@ -55,6 +55,8 @@ struct BlockDriver {
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+    BlockDriverAIOIOV *bdrv_aio_readv;
+    BlockDriverAIOIOV *bdrv_aio_writev;
     int aiocb_size;
 
     const char *protocol_name;

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 4 of 5] dma api
  2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 18:55     ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-12 18:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, chrisw, avi, Gerd Hoffmann, kvm

On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
>  One major limitation for KVM today is the lack of a proper way to write drivers
>  in a way that allows the host OS to use direct DMA to the guest physical memory
>  to avoid any intermediate copy. The only API provided to drivers seems to be
>  the cpu_physical_memory_rw and that enforces all drivers to bounce and trash
>  cpu caches and be memory bound. This new DMA API instead allows drivers to use
>  a pci_dma_sg method for SG I/O that will translate the guest physical addresses
>  to host virutal addresses and it will call two operation, one is a submit
>  method and one is the complete method. The pci_dma_sg may have to bounce buffer
>  internally and to limit the max bounce size it may have to submit I/O in pieces
>  with multiple submit calls.
>
>  All we care about is the performance of the direct path, so I tried to
>  avoid dynamic allocations there to avoid entering glibc.
>
>  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>

>  + * QEMU PCI DMA operations

>  +typedef struct QEMUPciDmaSgParam {
>  +    QEMUPciDmaSgSubmit *pci_dma_sg_submit;
>  +    QEMUPciDmaSgComplete *pci_dma_sg_complete;
>  +    void *pci_dma_sg_opaque;

>  +    QEMUPciDmaSg *sg;

>  +    struct QEMUPciDmaSgParam *next;

>  +} QEMUPciDmaSgParam;

Still "PCI" here and other places, why? Even the "pci_dev" should be
bus_opaque for other buses.

+/* pci_dma.c */

Here pci_ prefix is even incorrect given that the file is now dma.c.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 4 of 5] dma api
@ 2008-12-12 18:55     ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-12 18:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: chrisw, kvm, avi, Gerd Hoffmann

On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
>  One major limitation for KVM today is the lack of a proper way to write drivers
>  in a way that allows the host OS to use direct DMA to the guest physical memory
>  to avoid any intermediate copy. The only API provided to drivers seems to be
>  the cpu_physical_memory_rw and that enforces all drivers to bounce and trash
>  cpu caches and be memory bound. This new DMA API instead allows drivers to use
>  a pci_dma_sg method for SG I/O that will translate the guest physical addresses
>  to host virtual addresses and it will call two operations, one is a submit
>  method and one is the complete method. The pci_dma_sg may have to bounce buffer
>  internally and to limit the max bounce size it may have to submit I/O in pieces
>  with multiple submit calls.
>
>  All we care about is the performance of the direct path, so I tried to
>  avoid dynamic allocations there to avoid entering glibc.
>
>  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>

>  + * QEMU PCI DMA operations

>  +typedef struct QEMUPciDmaSgParam {
>  +    QEMUPciDmaSgSubmit *pci_dma_sg_submit;
>  +    QEMUPciDmaSgComplete *pci_dma_sg_complete;
>  +    void *pci_dma_sg_opaque;

>  +    QEMUPciDmaSg *sg;

>  +    struct QEMUPciDmaSgParam *next;

>  +} QEMUPciDmaSgParam;

Still "PCI" here and other places, why? Even the "pci_dev" should be
bus_opaque for other buses.

+/* pci_dma.c */

Here pci_ prefix is even incorrect given that the file is now dma.c.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 19:00     ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-12 19:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, chrisw, avi, Gerd Hoffmann, kvm

On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
>  Add can_dma and post_dma methods needed before/after direct IO to guest
>  physical memory.
>
>  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>

>  +        /* nonlinear range */
>  +        if (pd_first != pd)
>  +            return NULL;

In my tests on Sparc32, IOMMU can map a linear DVMA range to several
non-linear physical pages, so this case should be handled correctly.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 19:00     ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-12 19:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: chrisw, kvm, avi, Gerd Hoffmann

On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
>  Add can_dma and post_dma methods needed before/after direct IO to guest
>  physical memory.
>
>  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>

>  +        /* nonlinear range */
>  +        if (pd_first != pd)
>  +            return NULL;

In my tests on Sparc32, IOMMU can map a linear DVMA range to several
non-linear physical pages, so this case should be handled correctly.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 1 of 5] fix cpu_physical_memory len
  2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 19:06     ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:06 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Andrea Arcangeli wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
> Be consistent and have length be size_t for all methods.
>   

ram_addr_t would be better than size_t here.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 1 of 5] fix cpu_physical_memory len
@ 2008-12-12 19:06     ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:06 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Andrea Arcangeli wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
> Be consistent and have length be size_t for all methods.
>   

ram_addr_t would be better than size_t here.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 19:15     ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:15 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Andrea Arcangeli wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
> Add can_dma and post_dma methods needed before/after direct IO to guest
> physical memory.
>   

I think any API based on a can_dma abstraction is wrong.  The 
write_post_dma thing is equally wrong.

The concept of "dma" that you're introducing is not correct.

The DMA API should have the following properties:

1) You attempt to map a physical address.  This effectively is a lock or 
pin operation.
  a) In the process of this, you get a virtual address that you can 
manipulate.
2) You do your IO to the virtual address
3) You indicate how much of the memory you have dirtied
4) You unmap or unlock that memory region.  The virtual address is now 
no longer valid.

This could correspond to a:

void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, 
int is_write);

void cpu_physical_memory_unmap(target_phys_addr_t addr, ram_addr_t 
size, void *mapping, int is_dirty);

The whole dma.c thing should not exist.  If we're going to introduce a 
higher level API, it should be a PCI DMA API.

Something like virtio could use this API directly seeing as it doesn't 
really live on a PCI bus in real life.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 19:15     ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:15 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Andrea Arcangeli wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
>
> Add can_dma and post_dma methods needed before/after direct IO to guest
> physical memory.
>   

I think any API based on a can_dma abstraction is wrong.  The 
write_post_dma thing is equally wrong.

The concept of "dma" that you're introducing is not correct.

The DMA API should have the following properties:

1) You attempt to map a physical address.  This effectively is a lock or 
pin operation.
  a) In the process of this, you get a virtual address that you can 
manipulate.
2) You do your IO to the virtual address
3) You indicate how much of the memory you have dirtied
4) You unmap or unlock that memory region.  The virtual address is now 
no longer valid.

This could correspond to a:

void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, 
int is_write);

void cpu_physical_memory_unmap(target_phys_addr_t addr, ram_addr_t 
size, void *mapping, int is_dirty);

The whole dma.c thing should not exist.  If we're going to introduce a 
higher level API, it should be a PCI DMA API.

Something like virtio could use this API directly seeing as it doesn't 
really live on a PCI bus in real life.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:00     ` Blue Swirl
@ 2008-12-12 19:18       ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:18 UTC (permalink / raw)
  To: Blue Swirl; +Cc: qemu-devel, chrisw, avi, Gerd Hoffmann, kvm

Blue Swirl wrote:
> On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
>   
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>>  Add can_dma and post_dma methods needed before/after direct IO to guest
>>  physical memory.
>>
>>  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
>>     
>
>   
>>  +        /* nonlinear range */
>>  +        if (pd_first != pd)
>>  +            return NULL;
>>     
>
> In my tests on Sparc32, IOMMU can map a linear DVMA range to several
> non-linear physical pages, so this case should be handled correctly.
>   

I think we should go back to Fabrice's earliest suggestion here.  We 
should just have a simple map/unmap lock/unlock API for physical 
memory.  That should be the base API IMHO.

As long as the map function goes from guest physical => host virtual, it 
can work for everything we care about.

This is orthogonal to an API dealing with scatter/gather lists and 
translation to and from them.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 19:18       ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:18 UTC (permalink / raw)
  To: Blue Swirl; +Cc: chrisw, Gerd Hoffmann, qemu-devel, kvm, avi

Blue Swirl wrote:
> On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
>   
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>>  Add can_dma and post_dma methods needed before/after direct IO to guest
>>  physical memory.
>>
>>  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
>>     
>
>   
>>  +        /* nonlinear range */
>>  +        if (pd_first != pd)
>>  +            return NULL;
>>     
>
> In my tests on Sparc32, IOMMU can map a linear DVMA range to several
> non-linear physical pages, so this case should be handled correctly.
>   

I think we should go back to Fabrice's earliest suggestion here.  We 
should just have a simple map/unmap lock/unlock API for physical 
memory.  That should be the base API IMHO.

As long as the map function goes from guest physical => host virtual, it 
can work for everything we care about.

This is orthogonal to an API dealing with scatter/gather lists and 
translation to and from them.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 1 of 5] fix cpu_physical_memory len
  2008-12-12 19:06     ` [Qemu-devel] " Anthony Liguori
@ 2008-12-12 19:26       ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 19:26 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

On Fri, Dec 12, 2008 at 01:06:56PM -0600, Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>> Be consistent and have length be size_t for all methods.
>>   
>
> ram_addr_t would be better than size_t here.

Yes, that is feasible even if the dma api output remains a raw iovec
(as it'll surely bounce, and the bouncing internally can restart with
unsigned long long length). To explain why it's set to a size_t, it's
just that I didn't think an emulated device would ever attempt a dma
on a >4G region on a 32bit host, and I was suggested to make this
assumption by the current code that can't even handle that on a 64bit
host (I made it possible on a 64bit host, on a 64bit host it makes
some sense as there can really be that much ram allocated). For 32bit
it mostly makes sense for mmio regions but that sounds a real
weirdness to do such a large dma on a mmio region. So I thought
sticking with size_t would be less prone to truncation errors and I
could do the sanity checking only once (currently you'd get a graceful
driver failure with the submit handler getting an error, if you
attempt that).

But I can change to ram_addr_t if you like. It's up to you!

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 1 of 5] fix cpu_physical_memory len
@ 2008-12-12 19:26       ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 19:26 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

On Fri, Dec 12, 2008 at 01:06:56PM -0600, Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>> Be consistent and have length be size_t for all methods.
>>   
>
> ram_addr_t would be better than size_t here.

Yes, that is feasible even if the dma api output remains a raw iovec
(as it'll surely bounce, and the bouncing internally can restart with
unsigned long long length). To explain why it's set to a size_t, it's
just that I didn't think an emulated device would ever attempt a dma
on a >4G region on a 32bit host, and I was suggested to make this
assumption by the current code that can't even handle that on a 64bit
host (I made it possible on a 64bit host, on a 64bit host it makes
some sense as there can really be that much ram allocated). For 32bit
it mostly makes sense for mmio regions but that sounds a real
weirdness to do such a large dma on a mmio region. So I thought
sticking with size_t would be less prone to truncation errors and I
could do the sanity checking only once (currently you'd get a graceful
driver failure with the submit handler getting an error, if you
attempt that).

But I can change to ram_addr_t if you like. It's up to you!

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:15     ` [Qemu-devel] " Anthony Liguori
@ 2008-12-12 19:37       ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 19:37 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
> 1) You attempt to map a physical address.  This effectively is a lock or 
> pin operation.

lock or pin for what? There's nothing to pin or lock here. Perhaps one
day we'll have to lock or pin against something, dunno, then we'll add
whatever pin or lock, otherwise why don't we also thread qcow2 while
we're at it, sounds more worth it than adding a lock or pin that isn't
actually needed.

>  a) In the process of this, you get a virtual address that you can 
> manipulate.

That's what can_dma does. If it can, it generates a dma address backed
by ram it does. But if it can't, it won't. This only works for direct I/O.

> 2) You do your IO to the virtual address

This is done by dma.c.

> 3) You indicate how much of the memory you have dirtied

That is done by the post_dma handler called by dma.c.

> 4) You unmap or unlock that memory region.  The virtual address is now no 
> longer valid.

Again not needed.

> This could correspond to a:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, int 
> is_write);
>
> void cpu_physical_memory_unmap(target_phys_addr_t addr, ram_addr_t 
> size, void *mapping, int is_dirty);

So you mean renaming can_dma to map and post_dma to unmap and not
adding any lock/pin at all that would otherwise gratuitously slow it
down?

I've no idea why you should ever want to call any unmap for reads
though... and calling it with your naming conventions, and not
invoking it for reads (there's absolutely no good reason to invoke any
unmap methods for reads, and it can only hurt at runtime) would be
weird in my view.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 19:37       ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 19:37 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
> 1) You attempt to map a physical address.  This effectively is a lock or 
> pin operation.

lock or pin for what? There's nothing to pin or lock here. Perhaps one
day we'll have to lock or pin against something, dunno, then we'll add
whatever pin or lock, otherwise why don't we also thread qcow2 while
we're at it, sounds more worth it than adding a lock or pin that isn't
actually needed.

>  a) In the process of this, you get a virtual address that you can 
> manipulate.

That's what can_dma does. If it can, it generates a dma address backed
by ram it does. But if it can't, it won't. This only works for direct I/O.

> 2) You do your IO to the virtual address

This is done by dma.c.

> 3) You indicate how much of the memory you have dirtied

That is done by the post_dma handler called by dma.c.

> 4) You unmap or unlock that memory region.  The virtual address is now no 
> longer valid.

Again not needed.

> This could correspond to a:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, int 
> is_write);
>
> void cpu_physical_memory_unmap(target_phys_addr_t addr, ram_addr_t 
> size, void *mapping, int is_dirty);

So you mean renaming can_dma to map and post_dma to unmap and not
adding any lock/pin at all that would otherwise gratuitously slow it
down?

I've no idea why you should ever want to call any unmap for reads
though... and calling it with your naming conventions, and not
invoking it for reads (there's absolutely no good reason to invoke any
unmap methods for reads, and it can only hurt at runtime) would be
weird in my view.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:15     ` [Qemu-devel] " Anthony Liguori
@ 2008-12-12 19:39       ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:39 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>> Add can_dma and post_dma methods needed before/after direct IO to guest
>> physical memory.
>>   
>
> I think any API based on a can_dma abstraction is wrong.  The 
> write_post_dma thing is equally wrong.
>
> The concept of "dma" that you're introducing is not correct.
>
> The DMA API should have the following properties:
>
> 1) You attempt to map a physical address.  This effectively is a lock 
> or pin operation.
>  a) In the process of this, you get a virtual address that you can 
> manipulate.
> 2) You do your IO to the virtual address
> 3) You indicate how much of the memory you have dirtied
> 4) You unmap or unlock that memory region.  The virtual address is now 
> no longer valid.
>
> This could correspond to a:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
> size, int is_write);
>
>> void cpu_physical_memory_unmap(target_phys_addr_t addr, ram_addr_t 
> size, void *mapping, int is_dirty);

Let me clarify this a bit more.  The problem we're trying to address 
today is the encapsulating knowledge of phys_ram_base.  We want to 
minimize the amount of code that makes any assumptions about 
phys_ram_base.  Your current API still accesses phys_ram_base directly 
in the PCI DMA API.  The only real improvement compared to the current 
virtio code is that you properly handle MMIO.  This is not just about 
layout but this also includes the fact that in the future, guest memory 
could be discontiguous in QEMU (think memory hotplug).

Or, in the case of Xen, it may not be possible to have all of guest 
memory mapped at any given time.  The API that I propose above limits 
the knowledge of how to access guest memory to exec.c and makes it very 
easy to deal with discontiguous or even partially mapped guest memory in 
QEMU.  Any other APIs built around it (like a PCI DMA API) don't have to 
have any special knowledge about how guest memory is stored.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 19:39       ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 19:39 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>> Add can_dma and post_dma methods needed before/after direct IO to guest
>> physical memory.
>>   
>
> I think any API based on a can_dma abstraction is wrong.  The 
> write_post_dma thing is equally wrong.
>
> The concept of "dma" that you're introducing is not correct.
>
> The DMA API should have the following properties:
>
> 1) You attempt to map a physical address.  This effectively is a lock 
> or pin operation.
>  a) In the process of this, you get a virtual address that you can 
> manipulate.
> 2) You do your IO to the virtual address
> 3) You indicate how much of the memory you have dirtied
> 4) You unmap or unlock that memory region.  The virtual address is now 
> no longer valid.
>
> This could correspond to a:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
> size, int is_write);
>
>> void cpu_physical_memory_unmap(target_phys_addr_t addr, ram_addr_t 
> size, void *mapping, int is_dirty);

Let me clarify this a bit more.  The problem we're trying to address 
today is the encapsulating knowledge of phys_ram_base.  We want to 
minimize the amount of code that makes any assumptions about 
phys_ram_base.  Your current API still accesses phys_ram_base directly 
in the PCI DMA API.  The only real improvement compared to the current 
virtio code is that you properly handle MMIO.  This is not just about 
layout but this also includes the fact that in the future, guest memory 
could be discontiguous in QEMU (think memory hotplug).

Or, in the case of Xen, it may not be possible to have all of guest 
memory mapped at any given time.  The API that I propose above limits 
the knowledge of how to access guest memory to exec.c and makes it very 
easy to deal with discontiguous or even partially mapped guest memory in 
QEMU.  Any other APIs built around it (like a PCI DMA API) don't have to 
have any special knowledge about how guest memory is stored.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:18       ` Anthony Liguori
  (?)
@ 2008-12-12 20:05       ` Blue Swirl
  2008-12-12 20:10           ` Anthony Liguori
  -1 siblings, 1 reply; 111+ messages in thread
From: Blue Swirl @ 2008-12-12 20:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: chrisw, Gerd Hoffmann, kvm, avi

On 12/12/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl wrote:
>
> > On 12/12/08, Andrea Arcangeli <aarcange@redhat.com> wrote:
> >
> >
> > > From: Andrea Arcangeli <aarcange@redhat.com>
> > >
> > >  Add can_dma and post_dma methods needed before/after direct IO to guest
> > >  physical memory.
> > >
> > >  Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> > >
> > >
> >
> >
> >
> > >  +        /* nonlinear range */
> > >  +        if (pd_first != pd)
> > >  +            return NULL;
> > >
> > >
> >
> > In my tests on Sparc32, IOMMU can map a linear DVMA range to several
> > non-linear physical pages, so this case should be handled correctly.
> >
> >
>
>  I think we should go back to Fabrice's earliest suggestion here.  We should
> just have a simple map/unmap lock/unlock API for physical memory.  That
> should be the base API IMHO.

Beautiful!

>  As long the map function goes from guest physical => host virtual, it can
> work for everything we care about.
>
>  This is orthogonal to an API dealing with scatter/gather lists and
> translation to and from them.

So again we tried to solve too many problems at once.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:37       ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 20:09         ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 20:09 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>   
>> 1) You attempt to map a physical address.  This effectively is a lock or 
>> pin operation.
>>     
>
> lock or pin for what? There's nothing to pin or lock here. Perhaps one
> day we'll have to lock or pin against something, dunno, then we'll add
> whatever pin or lock, otherwise why don't we also thread qcow2 while
> we're at it, sounds more worth it than adding a lock or pin that isn't
> actually needed.
>   

Please try to understand what people are suggesting to you.  I have no 
idea why you bring up threading qcow2.  This is not a topic that hasn't 
been discussed at great length before.  You didn't even fix any of the 
comments people made against the series since the last time you posted it.

>>  a) In the process of this, you get a virtual address that you can 
>> manipulate.
>>     
>
> That's what can_dma does. If it can, it generates a dma address backed
> by ram it does. But if it can't, it won't. This only works for direct I/O.
>   

No, can_dma does two things in one.  It checks to see if the physical 
region is MMIO, and it returns phys_ram_base + PA otherwise.

This is not helpful though in the general case.  It's not an API that 
extends well for things like memory hotplug, etc.

If we're not building APIs for the future, what's the point of this 
exercise in the first place?  virtio already has the bits to do 
zero-copy memory transfers.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 20:09         ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 20:09 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>   
>> 1) You attempt to map a physical address.  This effectively is a lock or 
>> pin operation.
>>     
>
> lock or pin for what? There's nothing to pin or lock here. Perhaps one
> day we'll have to lock or pin against something, dunno, then we'll add
> whatever pin or lock, otherwise why don't we also thread qcow2 while
> we're at it, sounds more worth it than adding a lock or pin that isn't
> actually needed.
>   

Please try to understand what people are suggesting to you.  I have no 
idea why you bring up threading qcow2.  This is not a topic that hasn't 
been discussed at great length before.  You didn't even fix any of the 
comments people made against the series since the last time you posted it.

>>  a) In the process of this, you get a virtual address that you can 
>> manipulate.
>>     
>
> That's what can_dma does. If it can, it generates a dma address backed
> by ram it does. But if it can't, it won't. This only works for direct I/O.
>   

No, can_dma does two things in one.  It checks to see if the physical 
region is MMIO, and it returns phys_ram_base + PA otherwise.

This is not helpful though in the general case.  It's not an API that 
extends well for things like memory hotplug, etc.

If we're not building APIs for the future, what's the point of this 
exercise in the first place?  virtio already has the bits to do 
zero-copy memory transfers.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 20:05       ` Blue Swirl
@ 2008-12-12 20:10           ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 20:10 UTC (permalink / raw)
  To: Blue Swirl; +Cc: qemu-devel, chrisw, Gerd Hoffmann, kvm, avi

Blue Swirl wrote:
> On 12/12/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
>   
>>  I think we should go back to Fabrice's earliest suggestion here.  We should
>> just have a simple map/unmap lock/unlock API for physical memory.  That
>> should be the base API IMHO.
>>     
>
> Beautiful!
>
>   
>>  As long the map function goes from guest physical => host virtual, it can
>> work for everything we care about.
>>
>>  This is orthogonal to an API dealing with scatter/gather lists and
>> translation to and from them.
>>     
>
> So again we tried to solve too many problems at once.
>   

Yup.

Regards,

Anthony Liguori

> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 20:10           ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-12 20:10 UTC (permalink / raw)
  To: Blue Swirl; +Cc: chrisw, avi, qemu-devel, kvm, Gerd Hoffmann

Blue Swirl wrote:
> On 12/12/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
>   
>>  I think we should go back to Fabrice's earliest suggestion here.  We should
>> just have a simple map/unmap lock/unlock API for physical memory.  That
>> should be the base API IMHO.
>>     
>
> Beautiful!
>
>   
>>  As long the map function goes from guest physical => host virtual, it can
>> work for everything we care about.
>>
>>  This is orthogonal to an API dealing with scatter/gather lists and
>> translation to and from them.
>>     
>
> So again we tried to solve too many problems at once.
>   

Yup.

Regards,

Anthony Liguori

> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:37       ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-12 20:25         ` Gerd Hoffmann
  -1 siblings, 0 replies; 111+ messages in thread
From: Gerd Hoffmann @ 2008-12-12 20:25 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, qemu-devel, kvm, avi, chrisw

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>> 1) You attempt to map a physical address.  This effectively is a lock or 
>> pin operation.
> 
> lock or pin for what? There's nothing to pin or lock here.

Oh, there is in Xen context (i.e. qemu in dom0 handles device emulation
for xen domU guests).  You can ask Xen to map the grant (or guest pfn)
here.  I think doing so will kill the need for the mapcache patches
xensource carries in the xenified qemu tree.

>> 4) You unmap or unlock that memory region.  The virtual address is now no 
>> longer valid.
> 
> Again not needed.

Likewise useful for Xen.

cheers,
  Gerd


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-12 20:25         ` Gerd Hoffmann
  0 siblings, 0 replies; 111+ messages in thread
From: Gerd Hoffmann @ 2008-12-12 20:25 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, qemu-devel, kvm

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>> 1) You attempt to map a physical address.  This effectively is a lock or 
>> pin operation.
> 
> lock or pin for what? There's nothing to pin or lock here.

Oh, there is in Xen context (i.e. qemu in dom0 handles device emulation
for xen domU guests).  You can ask Xen to map the grant (or guest pfn)
here.  I think doing so will kill the need for the mapcache patches
xensource carries in the xenified qemu tree.

>> 4) You unmap or unlock that memory region.  The virtual address is now no 
>> longer valid.
> 
> Again not needed.

Likewise useful for Xen.

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:39       ` [Qemu-devel] " Anthony Liguori
@ 2008-12-13  9:22         ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-13  9:22 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Anthony Liguori wrote:
>>
>> I think any API based on a can_dma abstraction is wrong.  The 
>> write_post_dma thing is equally wrong.
>>
>> The concept of "dma" that you're introducing is not correct.
>>
>> The DMA API should have the following properties:
>>
>> 1) You attempt to map a physical address.  This effectively is a lock 
>> or pin operation.
>>  a) In the process of this, you get a virtual address that you can 
>> manipulate.
>> 2) You do your IO to the virtual address
>> 3) You indicate how much of the memory you have dirtied
>> 4) You unmap or unlock that memory region.  The virtual address is 
>> now no longer valid.
>>
>> This could correspond to a:
>>
>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
>> size, int is_write);
>>
>> void cpu_physical_memory_unmap(target_physical_addr_t addr, 
>> ram_addr_t size, void *mapping, int is_dirty);
>
> Let me clarify this a bit more.  The problem we're trying to address 
> today is the encapsulating knowledge of phys_ram_base.  We want to 
> minimize the amount of code that makes any assumptions about 
> phys_ram_base.  Your current API still accesses phys_ram_base directly 
> in the PCI DMA API.  The only real improvement compared to the current 
> virtio code is that you properly handle MMIO.  This is not just about 
> layout but this also includes the fact that in the future, guest 
> memory could be discontiguous in QEMU (think memory hotplug).

There are two more problems the dma api solves:

- DMA into mmio regions; this requires bouncing
- DMA with an associated transform (xor, byteswap); also requires bouncing

In turn, bouncing requires splitting large requests to avoid unbounded 
memory allocation.

While I think _map/_unmap is an improvement over can_dma(), this API 
can't handle bounded bouncing, and so a separate layer (dma.c) is still 
necessary.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13  9:22         ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-13  9:22 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Anthony Liguori wrote:
>>
>> I think any API based on a can_dma abstraction is wrong.  The 
>> write_post_dma thing is equally wrong.
>>
>> The concept of "dma" that you're introducing is not correct.
>>
>> The DMA API should have the following properties:
>>
>> 1) You attempt to map a physical address.  This effectively is a lock 
>> or pin operation.
>>  a) In the process of this, you get a virtual address that you can 
>> manipulate.
>> 2) You do your IO to the virtual address
>> 3) You indicate how much of the memory you have dirtied
>> 4) You unmap or unlock that memory region.  The virtual address is 
>> now no longer valid.
>>
>> This could correspond to a:
>>
>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
>> size, int is_write);
>>
>> void cpu_physical_memory_unmap(target_physical_addr_t addr, 
>> ram_addr_t size, void *mapping, int is_dirty);
>
> Let me clarify this a bit more.  The problem we're trying to address 
> today is the encapsulating knowledge of phys_ram_base.  We want to 
> minimize the amount of code that makes any assumptions about 
> phys_ram_base.  Your current API still accesses phys_ram_base directly 
> in the PCI DMA API.  The only real improvement compared to the current 
> virtio code is that you properly handle MMIO.  This is not just about 
> layout but this also includes the fact that in the future, guest 
> memory could be discontiguous in QEMU (think memory hotplug).

There are two more problems the dma api solves:

- DMA into mmio regions; this requires bouncing
- DMA with an associated transform (xor, byteswap); also requires bouncing

In turn, bouncing requires splitting large requests to avoid unbounded 
memory allocation.

While I think _map/_unmap is an improvement over can_dma(), this API 
can't handle bounded bouncing, and so a separate layer (dma.c) is still 
necessary.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:15     ` [Qemu-devel] " Anthony Liguori
@ 2008-12-13 14:39       ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-13 14:39 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, int 
> is_write);

Just a side note (doesn't mean I agree with the above): it doesn't
make sense to use a ram_addr_t size when you return a 32-bit address
on a 32-bit qemu build.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 14:39       ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-13 14:39 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, int 
> is_write);

Just a side note (doesn't mean I agree with the above): it doesn't
make sense to use a ram_addr_t size when you return a 32-bit address
on a 32-bit qemu build.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13  9:22         ` Avi Kivity
@ 2008-12-13 16:45           ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 16:45 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>>>
>>> I think any API based on a can_dma abstraction is wrong.  The 
>>> write_post_dma thing is equally wrong.
>>>
>>> The concept of "dma" that you're introducing is not correct.
>>>
>>> The DMA API should have the following properties:
>>>
>>> 1) You attempt to map a physical address.  This effectively is a 
>>> lock or pin operation.
>>>  a) In the process of this, you get a virtual address that you can 
>>> manipulate.
>>> 2) You do your IO to the virtual address
>>> 3) You indicate how much of the memory you have dirtied
>>> 4) You unmap or unlock that memory region.  The virtual address is 
>>> now no longer valid.
>>>
>>> This could correspond to a:
>>>
>>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
>>> size, int is_write);
>>>
>>> void cpu_physical_memory_unmap(target_physical_addr_t addr, 
>>> ram_addr_t size, void *mapping, int is_dirty);
>>
>> Let me clarify this a bit more.  The problem we're trying to address 
>> today is the encapsulating knowledge of phys_ram_base.  We want to 
>> minimize the amount of code that makes any assumptions about 
>> phys_ram_base.  Your current API still accesses phys_ram_base 
>> directly in the PCI DMA API.  The only real improvement compared to 
>> the current virtio code is that you properly handle MMIO.  This is 
>> not just about layout but this also includes the fact that in the 
>> future, guest memory could be discontiguous in QEMU (think memory 
>> hotplug).
>
> There are two more problems the dma api solves:
>
> - DMA into mmio regions; this requires bouncing

The map() API I proposed above should do bouncing to MMIO regions.  To 
deal with unbounded allocation, you can simply fail when the mapping 
allocation has reached some high limit.  Calling code needs to cope with 
the fact that map'ing may succeed or fail.

> - DMA with an associated transform (xor, byteswap); also requires 
> bouncing

At this layer of the API, this is unnecessary.  At the PCI layer, you 
would need to handle this and I'd suggest taking the same approach as above.

> In turn, bouncing requires splitting large requests to avoid unbounded 
> memory allocation.
>
> While I think _map/_unmap is an improvement over can_dma(), this API 
> can't handle bounded bouncing, and so a separate layer (dma.c) is 
> still necessary.

I think it can handle bounded bouncing fine.  It's a matter of ensuring 
the bounce buffer max is sufficiently large and ensuring that any client 
code can cope with map failures.  In fact, there probably needs to be a 
notifiers API that is invoked whenever an unmap() happens so that if an 
asynchronous request is waiting because of a map failure, it can be 
notified as to when it should try again.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 16:45           ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 16:45 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Avi Kivity wrote:
> Anthony Liguori wrote:
>>>
>>> I think any API based on a can_dma abstraction is wrong.  The 
>>> write_post_dma thing is equally wrong.
>>>
>>> The concept of "dma" that you're introducing is not correct.
>>>
>>> The DMA API should have the following properties:
>>>
>>> 1) You attempt to map a physical address.  This effectively is a 
>>> lock or pin operation.
>>>  a) In the process of this, you get a virtual address that you can 
>>> manipulate.
>>> 2) You do your IO to the virtual address
>>> 3) You indicate how much of the memory you have dirtied
>>> 4) You unmap or unlock that memory region.  The virtual address is 
>>> now no longer valid.
>>>
>>> This could correspond to a:
>>>
>>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
>>> size, int is_write);
>>>
>>> void cpu_physical_memory_unmap(target_physical_addr_t addr, 
>>> ram_addr_t size, void *mapping, int is_dirty);
>>
>> Let me clarify this a bit more.  The problem we're trying to address 
>> today is the encapsulating knowledge of phys_ram_base.  We want to 
>> minimize the amount of code that makes any assumptions about 
>> phys_ram_base.  Your current API still accesses phys_ram_base 
>> directly in the PCI DMA API.  The only real improvement compared to 
>> the current virtio code is that you properly handle MMIO.  This is 
>> not just about layout but this also includes the fact that in the 
>> future, guest memory could be discontiguous in QEMU (think memory 
>> hotplug).
>
> There are two more problems the dma api solves:
>
> - DMA into mmio regions; this requires bouncing

The map() API I proposed above should do bouncing to MMIO regions.  To 
deal with unbounded allocation, you can simply fail when the mapping 
allocation has reached some high limit.  Calling code needs to cope with 
the fact that map'ing may succeed or fail.

> - DMA with an associated transform (xor, byteswap); also requires 
> bouncing

At this layer of the API, this is unnecessary.  At the PCI layer, you 
would need to handle this and I'd suggest taking the same approach as above.

> In turn, bouncing requires splitting large requests to avoid unbounded 
> memory allocation.
>
> While I think _map/_unmap is an improvement over can_dma(), this API 
> can't handle bounded bouncing, and so a separate layer (dma.c) is 
> still necessary.

I think it can handle bounded bouncing fine.  It's a matter of ensuring 
the bounce buffer max is sufficiently large and ensuring that any client 
code can cope with map failures.  In fact, there probably needs to be a 
notifiers API that is invoked whenever an unmap() happens so that if an 
asynchronous request is waiting because of a map failure, it can be 
notified as to when it should try again.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 14:39       ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-13 16:46         ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 16:46 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>   
>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, int 
>> is_write);
>>     
>
> Just a side note (doesn't mean I agree with the above), it doesn't
> make sense to use an ram_addr_t size when you return a 32bit address
> on 32bit qemu build.
>   

size_t is completely wrong for 64-bit targets on 32-bit hosts.  
ram_addr_t is the type we use for guest ram size.  It's 64-bit all of 
the time simply because it's easier to do that and we decided that the 
little bit of wasted space/computations were not a problem.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 16:46         ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 16:46 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>   
>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, int 
>> is_write);
>>     
>
> Just a side note (doesn't mean I agree with the above), it doesn't
> make sense to use an ram_addr_t size when you return a 32bit address
> on 32bit qemu build.
>   

size_t is completely wrong for 64-bit targets on 32-bit hosts.  
ram_addr_t is the type we use for guest ram size.  It's 64-bit all of 
the time simply because it's easier to do that and we decided that the 
little bit of wasted space/computations were not a problem.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 16:46         ` [Qemu-devel] " Anthony Liguori
@ 2008-12-13 16:53           ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-13 16:53 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

On Sat, Dec 13, 2008 at 10:46:49AM -0600, Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>>   
>>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, 
>>> int is_write);
>>>     
>>
>> Just a side note (doesn't mean I agree with the above), it doesn't
>> make sense to use an ram_addr_t size when you return a 32bit address
>> on 32bit qemu build.
>>   
>
> size_t is completely wrong for 64-bit targets on 32-bit hosts.  ram_addr_t 
> is the type we use for guest ram size.  It's 64-bit all of the time simply 
> because it's easier to do that and we decided that the little bit of wasted 
> space/computations were not a problem.

Not sure why you think I'm suggesting you use size_t. I'm just
trying to tell you that if you insist on this
64bit-guest-on-32bit-host-is-dead-and-obsolete-to-support stance (i.e. if you
pass a ram_addr_t size to cpu_physical_memory_map), you at least have to
return ram_addr_t too. 'void *' is like size_t, so the above API,
taking a ram_addr_t length and returning 'void *', can't possibly be
sane.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 16:53           ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-13 16:53 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

On Sat, Dec 13, 2008 at 10:46:49AM -0600, Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>>   
>>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, 
>>> int is_write);
>>>     
>>
>> Just a side note (doesn't mean I agree with the above), it doesn't
>> make sense to use an ram_addr_t size when you return a 32bit address
>> on 32bit qemu build.
>>   
>
> size_t is completely wrong for 64-bit targets on 32-bit hosts.  ram_addr_t 
> is the type we use for guest ram size.  It's 64-bit all of the time simply 
> because it's easier to do that and we decided that the little bit of wasted 
> space/computations were not a problem.

Not sure why you think I'm suggesting you use size_t. I'm just
trying to tell you that if you insist on this
64bit-guest-on-32bit-host-is-dead-and-obsolete-to-support stance (i.e. if you
pass a ram_addr_t size to cpu_physical_memory_map), you at least have to
return ram_addr_t too. 'void *' is like size_t, so the above API,
taking a ram_addr_t length and returning 'void *', can't possibly be
sane.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 16:53           ` [Qemu-devel] " Andrea Arcangeli
  (?)
@ 2008-12-13 17:54           ` Andreas Färber
  -1 siblings, 0 replies; 111+ messages in thread
From: Andreas Färber @ 2008-12-13 17:54 UTC (permalink / raw)
  To: qemu-devel


Am 13.12.2008 um 17:53 schrieb Andrea Arcangeli:

> On Sat, Dec 13, 2008 at 10:46:49AM -0600, Anthony Liguori wrote:
>> Andrea Arcangeli wrote:
>>> On Fri, Dec 12, 2008 at 01:15:13PM -0600, Anthony Liguori wrote:
>>>
>>>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t  
>>>> size,
>>>> int is_write);
>>>>
>>>
>>> Just a side note (doesn't mean I agree with the above), it doesn't
>>> make sense to use an ram_addr_t size when you return a 32bit address
>>> on 32bit qemu build.
>>>
>>
>> size_t is completely wrong for 64-bit targets on 32-bit hosts.   
>> ram_addr_t
>> is the type we use for guest ram size.  It's 64-bit all of the time  
>> simply
>> because it's easier to do that and we decided that the little bit  
>> of wasted
>> space/computations were not a problem.
>
> Not sure why you think I'm suggesting you to use size_t. I'm just
> trying to tell you that if you insist in this
> 64bit-guest-on-32bit-host-is-dead-and-obsolete-to-support (i.e. if you
> pass a ram_addr_t size to cpu_physical_memory_map) you've at least to
> return ram_addr_t too). 'void *' is like size_t so the above API
> getting ram_addr_t length and returning 'void *', can't possibly be
> sane.

Since memory is allocated on the host with the host's limits, void*  
seems correct.

That you can't alloc, e.g., 4 GB of size on a 32-bit host is something  
the DMA API under discussion would need to handle as error. To detect  
such an attempt, the API needs the guest's data type for the size. :)  
It would then either assert or return NULL.

Don't forget that this is not only about supposedly "dead" x86 hosts,  
imagine ppc64 on ppc or amd64 on arm etc.

Andreas

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 16:45           ` Anthony Liguori
@ 2008-12-13 19:48             ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-13 19:48 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Anthony Liguori wrote:
>>
>> - DMA into mmio regions; this requires bouncing
>
> The map() API I proposed above should do bouncing to MMIO regions.  To
> deal with unbounded allocation, you can simply fail when the mapping
> allocation has reached some high limit.  Calling code needs to cope
> with the fact that map'ing may succeed or fail.

There are N users of this code, all of which would need to cope with the
failure.  Or there could be one user (dma.c) which handles the failure
and the bouncing.

dma.c _is_ a map/unmap api, except it doesn't expose the mapped data,
which allows it to control scheduling as well as be easier to use.

>
>> - DMA with an associated transform (xor, byteswap); also requires
>> bouncing
>
> At this layer of the API, this is unnecessary.  At the PCI layer, you
> would need to handle this and I'd suggest taking the same approach as
> above.

I agree to _map()/_unmap(), but it's not a solution by itself.

>
>> In turn, bouncing requires splitting large requests to avoid
>> unbounded memory allocation.
>>
>> While I think _map/_unmap is an improvement over can_dma(), this API
>> can't handle bounded bouncing, and so a separate layer (dma.c) is
>> still necessary.
>
> I think it can handled bounded bouncing fine.  It's a matter of
> ensuring the bounce buffer max is sufficiently large and ensuring that
> any client code can cope with map failures.  In fact, there probably
> needs to be a notifiers API that is invoked whenever an unmap()
> happens so that if an asynchronous request is waiting because of a map
> failure, it can be notified as to when it should try again.

Right, but who would it notify?

We need some place that can deal with this, and it isn't
_map()/_unmap(), and it isn't ide.c or scsi.c.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 19:48             ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-13 19:48 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Anthony Liguori wrote:
>>
>> - DMA into mmio regions; this requires bouncing
>
> The map() API I proposed above should do bouncing to MMIO regions.  To
> deal with unbounded allocation, you can simply fail when the mapping
> allocation has reached some high limit.  Calling code needs to cope
> with the fact that map'ing may succeed or fail.

There are N users of this code, all of which would need to cope with the
failure.  Or there could be one user (dma.c) which handles the failure
and the bouncing.

dma.c _is_ a map/unmap api, except it doesn't expose the mapped data,
which allows it to control scheduling as well as be easier to use.

>
>> - DMA with an associated transform (xor, byteswap); also requires
>> bouncing
>
> At this layer of the API, this is unnecessary.  At the PCI layer, you
> would need to handle this and I'd suggest taking the same approach as
> above.

I agree to _map()/_unmap(), but it's not a solution by itself.

>
>> In turn, bouncing requires splitting large requests to avoid
>> unbounded memory allocation.
>>
>> While I think _map/_unmap is an improvement over can_dma(), this API
>> can't handle bounded bouncing, and so a separate layer (dma.c) is
>> still necessary.
>
> I think it can handled bounded bouncing fine.  It's a matter of
> ensuring the bounce buffer max is sufficiently large and ensuring that
> any client code can cope with map failures.  In fact, there probably
> needs to be a notifiers API that is invoked whenever an unmap()
> happens so that if an asynchronous request is waiting because of a map
> failure, it can be notified as to when it should try again.

Right, but who would it notify?

We need some place that can deal with this, and it isn't
_map()/_unmap(), and it isn't ide.c or scsi.c.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 19:48             ` Avi Kivity
@ 2008-12-13 21:07               ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 21:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>   
>>> - DMA into mmio regions; this requires bouncing
>>>       
>> The map() API I proposed above should do bouncing to MMIO regions.  To
>> deal with unbounded allocation, you can simply fail when the mapping
>> allocation has reached some high limit.  Calling code needs to cope
>> with the fact that map'ing may succeed or fail.
>>     
>
> There are N users of this code, all of which would need to cope with the
> failure.  Or there could be one user (dma.c) which handles the failure
> and the bouncing.
>   

N should be small long term.  It should only be for places that would 
interact directly with CPU memory.  This would be the PCI BUS, the ISA 
BUS, some speciality devices, and possibly virtio (although you could 
argue it should go through the PCI BUS).

map() has to fail and that has nothing to do with bouncing or not 
bouncing.  In the case of Xen, you can have a guest that has 8GB of 
memory, and you only have 2GB of virtual address space.  If you try to 
DMA to more than 2GB of memory, there will be a failure.  Whoever is 
accessing memory directly in this fashion needs to cope with that.

> dma.c _is_ a map/unmap api, except it doesn't expose the mapped data,
> which allows it to control scheduling as well as be easier to use.
>   

As I understand dma.c, it performs the following action: map() as much 
as possible, call an actor on mapped memory, repeat until done, signal 
completion.

As an abstraction, it may be useful.  I would argue that it should be a 
bit more generic though.  It should take a function pointer for map and 
unmap too, and then you wouldn't need N versions of it for each 
different type of API.

> Right, but who would it notify?
>
> We need some place that can deal with this, and it isn't
> _map()/_unmap(), and it isn't ide.c or scsi.c.
>   

The pattern of try to map(), do IO, unmap(), repeat only really works 
for block IO.  It doesn't really work for network traffic.  You have to 
map the entire packet and send it all at once.  You cannot accept a 
partial mapping result.  The IO pattern to send an IO packet is much 
simpler: try to map the packet, if the mapping fails, either wait until 
more space frees up or drop the packet.  For the other uses of direct 
memory access, like kernel loading, the same is true.

What this is describing is not a DMA API.  It's a very specific IO 
pattern.  I think that's part of what's causing confusion in this 
series.  It's certainly not at all related to PCI DMA.

I would argue, that you really want to add a block driver interface that 
takes the necessary information, and implements this pattern but that's 
not important.  Reducing code duplication is a good thing so however it 
ends up working out is fine.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 21:07               ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 21:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Avi Kivity wrote:
> Anthony Liguori wrote:
>   
>>> - DMA into mmio regions; this requires bouncing
>>>       
>> The map() API I proposed above should do bouncing to MMIO regions.  To
>> deal with unbounded allocation, you can simply fail when the mapping
>> allocation has reached some high limit.  Calling code needs to cope
>> with the fact that map'ing may succeed or fail.
>>     
>
> There are N users of this code, all of which would need to cope with the
> failure.  Or there could be one user (dma.c) which handles the failure
> and the bouncing.
>   

N should be small long term.  It should only be for places that would 
interact directly with CPU memory.  This would be the PCI BUS, the ISA 
BUS, some speciality devices, and possibly virtio (although you could 
argue it should go through the PCI BUS).

map() has to fail and that has nothing to do with bouncing or not 
bouncing.  In the case of Xen, you can have a guest that has 8GB of 
memory, and you only have 2GB of virtual address space.  If you try to 
DMA to more than 2GB of memory, there will be a failure.  Whoever is 
accessing memory directly in this fashion needs to cope with that.

> dma.c _is_ a map/unmap api, except it doesn't expose the mapped data,
> which allows it to control scheduling as well as be easier to use.
>   

As I understand dma.c, it performs the following action: map() as much 
as possible, call an actor on mapped memory, repeat until done, signal 
completion.

As an abstraction, it may be useful.  I would argue that it should be a 
bit more generic though.  It should take a function pointer for map and 
unmap too, and then you wouldn't need N versions of it for each 
different type of API.

> Right, but who would it notify?
>
> We need some place that can deal with this, and it isn't
> _map()/_unmap(), and it isn't ide.c or scsi.c.
>   

The pattern of try to map(), do IO, unmap(), repeat only really works 
for block IO.  It doesn't really work for network traffic.  You have to 
map the entire packet and send it all at once.  You cannot accept a 
partial mapping result.  The IO pattern to send an IO packet is much 
simpler: try to map the packet, if the mapping fails, either wait until 
more space frees up or drop the packet.  For the other uses of direct 
memory access, like kernel loading, the same is true.

What this is describing is not a DMA API.  It's a very specific IO 
pattern.  I think that's part of what's causing confusion in this 
series.  It's certainly not at all related to PCI DMA.

I would argue, that you really want to add a block driver interface that 
takes the necessary information, and implements this pattern but that's 
not important.  Reducing code duplication is a good thing so however it 
ends up working out is fine.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 16:53           ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-13 21:11             ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 21:11 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Andrea Arcangeli wrote:
> On Sat, Dec 13, 2008 at 10:46:49AM -0600, Anthony Liguori wrote:
>   
> Not sure why you think I'm suggesting you to use size_t. I'm just
> trying to tell you that if you insist in this
> 64bit-guest-on-32bit-host-is-dead-and-obsolete-to-support (i.e. if you
> pass a ram_addr_t size to cpu_physical_memory_map) you've at least to
> return ram_addr_t too). 'void *' is like size_t so the above API
> getting ram_addr_t length and returning 'void *', can't possibly be
> sane.
>   

If you take a size_t, then all callers have to validate that the size 
they're passing in (which may originate from the guest), is not going to 
cause an overflow.  You will naturally validate this in the map() 
function because you cannot map something that is greater than can fit 
in a void *.  All callers have to handle the case where return is NULL 
from map() which means that you can fold this error checking into map() 
without the callers having to even think of it.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 21:11             ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 21:11 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Andrea Arcangeli wrote:
> On Sat, Dec 13, 2008 at 10:46:49AM -0600, Anthony Liguori wrote:
>   
> Not sure why you think I'm suggesting you to use size_t. I'm just
> trying to tell you that if you insist in this
> 64bit-guest-on-32bit-host-is-dead-and-obsolete-to-support (i.e. if you
> pass a ram_addr_t size to cpu_physical_memory_map) you've at least to
> return ram_addr_t too). 'void *' is like size_t so the above API
> getting ram_addr_t length and returning 'void *', can't possibly be
> sane.
>   

If you take a size_t, then all callers have to validate that the size 
they're passing in (which may originate from the guest), is not going to 
cause an overflow.  You will naturally validate this in the map() 
function because you cannot map something that is greater than can fit 
in a void *.  All callers have to handle the case where return is NULL 
from map() which means that you can fold this error checking into map() 
without the callers having to even think of it.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:15     ` [Qemu-devel] " Anthony Liguori
@ 2008-12-13 22:47       ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 22:47 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

Anthony Liguori wrote:
>
> This could correspond to a:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
> size, int is_write);
>
> void cpu_physical_memory_unmap(target_physical_addr_t addr, ram_addr_t 
> size, void *mapping, int is_dirty);

A really nice touch here, and note this is optional and can be a follow 
up series later, would be to use the mapping itself to encode the 
physical address and size so the signatures were:

void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, 
int flags);
void cpu_physical_memory_unmap(void *mapping);

flags could be PHYS_MAP_READ and/or PHYS_MAP_WRITE.

In unmap, you could check to see if the address is in phys_ram_base ... 
phys_ram_size.  If so, you can get the physical address.

If you maintained a list of mappings, you could then search the list of 
mappings based on the physical address and check the flags to see if a 
flush was required.

If you also stored the address in the list, you could search on unmap if 
the address was not in phys_ram_base .. phys_ram_size (which implies a 
bounce buffer).

Another potential optimization would be to provide a mechanism to 
explicitly set the dirty range of a physical mapping.  For instance:

cpu_physical_memory_map_set_dirty(void *start, ram_addr_t size);

That would let you only copy the data that actually needed to.  I think 
we can probably ignore this later optimization for a while though.

Regards,

Anthony Liguori

> The whole dma.c thing should not exist.  If we're going to introduce a 
> higher level API, it should be a PCI DMA API.
>
> Something like virtio could use this API directly seeing as it doesn't 
> really live on a PCI bus in real life.
>
> Regards,
>
> Anthony Liguori
>


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-13 22:47       ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-13 22:47 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

Anthony Liguori wrote:
>
> This could correspond to a:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
> size, int is_write);
>
> void cpu_physical_memory_unmap(target_physical_addr_t addr, ram_addr_t 
> size, void *mapping, int is_dirty);

A really nice touch here, and note this is optional and can be a follow 
up series later, would be to use the mapping itself to encode the 
physical address and size so the signatures were:

void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t size, 
int flags);
void cpu_physical_memory_unmap(void *mapping);

flags could be PHYS_MAP_READ and/or PHYS_MAP_WRITE.

In unmap, you could check to see if the address is in phys_ram_base ... 
phys_ram_size.  If so, you can get the physical address.

If you maintained a list of mappings, you could then search the list of 
mappings based on the physical address and check the flags to see if a 
flush was required.

If you also stored the address in the list, you could search on unmap if 
the address was not in phys_ram_base .. phys_ram_size (which implies a 
bounce buffer).

Another potential optimization would be to provide a mechanism to 
explicitly set the dirty range of a physical mapping.  For instance:

cpu_physical_memory_map_set_dirty(void *start, ram_addr_t size);

That would let you only copy the data that actually needed to.  I think 
we can probably ignore this later optimization for a while though.

Regards,

Anthony Liguori

> The whole dma.c thing should not exist.  If we're going to introduce a 
> higher level API, it should be a PCI DMA API.
>
> Something like virtio could use this API directly seeing as it doesn't 
> really live on a PCI bus in real life.
>
> Regards,
>
> Anthony Liguori
>

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 21:07               ` Anthony Liguori
@ 2008-12-14  6:03                 ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14  6:03 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Anthony Liguori wrote:
>>
>> There are N users of this code, all of which would need to cope with the
>> failure.  Or there could be one user (dma.c) which handles the failure
>> and the bouncing.
>>   
>
> N should be small long term.  It should only be for places that would 
> interact directly with CPU memory.  This would be the PCI BUS, the ISA 
> BUS, some speciality devices, and possibly virtio (although you could 
> argue it should go through the PCI BUS).

Fine, then let's rename it pci-dma.c.

>
> map() has to fail and that has nothing to do with bouncing or not 
> bouncing.  In the case of Xen, you can have a guest that has 8GB of 
> memory, and you only have 2GB of virtual address space.  If you try to 
> DMA to more than 2GB of memory, there will be a failure.  Whoever is 
> accessing memory directly in this fashion needs to cope with that.

The code already allows for failure, by partitioning the dma into 
segments.  Currently, this happens only on bounce buffer overflow, when 
the Xen code is integrated it can be expanded to accommodate this.

(There's a case for partitioning 2GB DMAs even without Xen; just to 
reduce the size of iovec allocations)

>
>> dma.c _is_ a map/unmap api, except it doesn't expose the mapped data,
>> which allows it to control scheduling as well as be easier to use.
>>   
>
> As I understand dma.c, it performs the following action: map() as much 
> as possible, call an actor on mapped memory, repeat until done, signal 
> completion.
>
> As an abstraction, it may be useful.  I would argue that it should be 
> a bit more generic though.  It should take a function pointer for map 
> and unmap too, and then you wouldn't need N versions of it for each 
> different type of API.

I don't follow.  What possible map/unmap pairs would it call, other than 
cpu_physical_memory_(map/unmap)()?

>
>> Right, but who would it notify?
>>
>> We need some place that can deal with this, and it isn't
>> _map()/_unmap(), and it isn't ide.c or scsi.c.
>>   
>
> The pattern of try to map(), do IO, unmap(), repeat only really works 
> for block IO.  It doesn't really work for network traffic.  You have 
> to map the entire packet and send it all at once.  You cannot accept a 
> partial mapping result.  The IO pattern to send an IO packet is much 
> simpler: try to map the packet, if the mapping fails, either wait 
> until more space frees up or drop the packet.  For the other uses of 
> direct memory access, like kernel loading, the same is true.

If so, the API should be extended to support more I/O patterns.

>
> What this is describing is not a DMA API.  It's a very specific IO 
> pattern.  I think that's part of what's causing confusion in this 
> series.  It's certainly not at all related to PCI DMA.

It deals with converting scatter/gather lists to iovecs, bouncing when 
this is not possible, and managing the bounce buffers.  If this is not 
dma, I'm not sure what is.  It certainly isn't part of block device 
emulation, it isn't part of the block layer (since bouncing is common to 
non-block devices).  What is it?

>
> I would argue, that you really want to add a block driver interface 
> that takes the necessary information, and implements this pattern but 
> that's not important.  Reducing code duplication is a good thing so 
> however it ends up working out is fine.

Right now the qemu block layer is totally independent of device 
emulation, and I think that's a good thing.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14  6:03                 ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14  6:03 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Anthony Liguori wrote:
>>
>> There are N users of this code, all of which would need to cope with the
>> failure.  Or there could be one user (dma.c) which handles the failure
>> and the bouncing.
>>   
>
> N should be small long term.  It should only be for places that would 
> interact directly with CPU memory.  This would be the PCI BUS, the ISA 
> BUS, some speciality devices, and possibly virtio (although you could 
> argue it should go through the PCI BUS).

Fine, then let's rename it pci-dma.c.

>
> map() has to fail and that has nothing to do with bouncing or not 
> bouncing.  In the case of Xen, you can have a guest that has 8GB of 
> memory, and you only have 2GB of virtual address space.  If you try to 
> DMA to more than 2GB of memory, there will be a failure.  Whoever is 
> accessing memory directly in this fashion needs to cope with that.

The code already allows for failure, by partitioning the dma into 
segments.  Currently, this happens only on bounce buffer overflow, when 
the Xen code is integrated it can be expanded to accommodate this.

(There's a case for partitioning 2GB DMAs even without Xen; just to 
reduce the size of iovec allocations)

>
>> dma.c _is_ a map/unmap api, except it doesn't expose the mapped data,
>> which allows it to control scheduling as well as be easier to use.
>>   
>
> As I understand dma.c, it performs the following action: map() as much 
> as possible, call an actor on mapped memory, repeat until done, signal 
> completion.
>
> As an abstraction, it may be useful.  I would argue that it should be 
> a bit more generic though.  It should take a function pointer for map 
> and unmap too, and then you wouldn't need N versions of it for each 
> different type of API.

I don't follow.  What possible map/unmap pairs would it call, other than 
cpu_physical_memory_(map/unmap)()?

>
>> Right, but who would it notify?
>>
>> We need some place that can deal with this, and it isn't
>> _map()/_unmap(), and it isn't ide.c or scsi.c.
>>   
>
> The pattern of try to map(), do IO, unmap(), repeat only really works 
> for block IO.  It doesn't really work for network traffic.  You have 
> to map the entire packet and send it all at once.  You cannot accept a 
> partial mapping result.  The IO pattern to send an IO packet is much 
> simpler: try to map the packet, if the mapping fails, either wait 
> until more space frees up or drop the packet.  For the other uses of 
> direct memory access, like kernel loading, the same is true.

If so, the API should be extended to support more I/O patterns.

>
> What this is describing is not a DMA API.  It's a very specific IO 
> pattern.  I think that's part of what's causing confusion in this 
> series.  It's certainly not at all related to PCI DMA.

It deals with converting scatter/gather lists to iovecs, bouncing when 
this is not possible, and managing the bounce buffers.  If this is not 
dma, I'm not sure what is.  It certainly isn't part of block device 
emulation, it isn't part of the block layer (since bouncing is common to 
non-block devices).  What is it?

>
> I would argue, that you really want to add a block driver interface 
> that takes the necessary information, and implements this pattern but 
> that's not important.  Reducing code duplication is a good thing so 
> however it ends up working out is fine.

Right now the qemu block layer is totally independent of device 
emulation, and I think that's a good thing.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 22:47       ` [Qemu-devel] " Anthony Liguori
@ 2008-12-14  6:07         ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14  6:07 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, Gerd Hoffmann, qemu-devel, kvm, chrisw

Anthony Liguori wrote:
> Anthony Liguori wrote:
>>
>> This could correspond to a:
>>
>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
>> size, int is_write);
>>
>> void cpu_physical_memory_unmap(target_physical_addr_t addr, 
>> ram_addr_t size, void *mapping, int is_dirty);
>
> A really nice touch here, and note this is optional and can be a 
> follow up series later, would be to use the mapping itself to encode 
> the physical address and size so the signatures were:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
> size, int flags);
> void cpu_physical_memory_unmap(void *mapping);
>
> flags could be PHYS_MAP_READ and/or PHYS_MAP_WRITE.
>
> In unmap, you could check to see if the address is in phys_ram_base 
> ... phys_ram_size.  If so, you can get the physical address.
>
> If you maintained a list of mappings, you could then search the list 
> of mappings based on the physical address and check the flags to see 
> if a flush was required.


That will come back and bite us when we implement memory slots (for 
memory hotplug, etc.).  You want here to return two values - a pointer 
for the caller's use, and some information for unmap's use.  The best 
thing to do is return two values instead of having unmap() try to figure 
out the things that map() has already calculated.



-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14  6:07         ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14  6:07 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm, qemu-devel

Anthony Liguori wrote:
> Anthony Liguori wrote:
>>
>> This could correspond to a:
>>
>> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
>> size, int is_write);
>>
>> void cpu_physical_memory_unmap(target_physical_addr_t addr, 
>> ram_addr_t size, void *mapping, int is_dirty);
>
> A really nice touch here, and note this is optional and can be a 
> follow up series later, would be to use the mapping itself to encode 
> the physical address and size so the signatures were:
>
> void *cpu_physical_memory_map(target_phys_addr_t addr, ram_addr_t 
> size, int flags);
> void cpu_physical_memory_unmap(void *mapping);
>
> flags could be PHYS_MAP_READ and/or PHYS_MAP_WRITE.
>
> In unmap, you could check to see if the address is in phys_ram_base 
> ... phys_ram_size.  If so, you can get the physical address.
>
> If you maintained a list of mappings, you could then search the list 
> of mappings based on the physical address and check the flags to see 
> if a flush was required.


That will come back and bite us when we implement memory slots (for 
memory hotplug, etc.).  You want here to return two values - a pointer 
for the caller's use, and some information for unmap's use.  The best 
thing to do is return two values instead of having unmap() try to figure 
out the things that map() has already calculated.



-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-13 21:11             ` [Qemu-devel] " Anthony Liguori
@ 2008-12-14 16:47               ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-14 16:47 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm, avi, chrisw

On Sat, Dec 13, 2008 at 03:11:57PM -0600, Anthony Liguori wrote:
> cause an overflow.  You will naturally validate this in the map() function 
> because you cannot map something that is greater than can fit in a void *.  

When you told me to pass ram_addr_t instead of size_t in my patch, I
didn't mean it was just for validating that callers would comply with
the clear dma interface. With my patch I was going to truly support
dma operations larger than 4G on 32bit host and 64bit guest, but only
with mmio regions as destination, and with a max overhead of the
max-bounce-size of 1M.

To me map/unmap looks backwards. There's absolutely no point at all to
pretend that RAM isn't always mapped. Furthermore bouncing inside that
layer (at least with the api that you're proposing that can't handle
partial I/O and restart) is obviously broken design.

Once memory hotplug will emerge we've just to add a read write lock
before invoking can-dma/post-dma and stuff. There's no reason to ever
call anything after a read dma completed.

After my stuff would work, my next step would be to get rid entirely
of that per-page array that translates a ram_addr_t to a virtual
address and replace it with a rbtree of linear ranges, and then the
iovec would need to be passed down to exec.c so that it would be
filled with direct dma even if the whole range isn't linear. And in
average a single lookup of the tree would return us immediate
information.

I'm ok to support a not entirely flat ram space, but pretending to
support an API that requires to mangle host ptes (and sptes on kvm
case) every time there's a dma is entirely overkill and backwards,
besides preventing you to bounce sanely if you go over mmio regions
and preventing you as well to dma >4G space of mmio on 32bit build
with 64bit guest.

The whole concept of having to map something is flawed, there's
nothing to map. At most you've to take a read lock to prevent future
memory hotplug to change the memory layout from under you, but the
concept of mapping has nothing to do with that. RAM is always mapped,
and mmio has to be emulated anyway so it's worthless to map it.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 16:47               ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-14 16:47 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: chrisw, avi, Gerd Hoffmann, kvm, qemu-devel

On Sat, Dec 13, 2008 at 03:11:57PM -0600, Anthony Liguori wrote:
> cause an overflow.  You will naturally validate this in the map() function 
> because you cannot map something that is greater than can fit in a void *.  

When you told me to pass ram_addr_t instead of size_t in my patch, I
didn't mean it was just for validating that callers would comply with
the clear dma interface. With my patch I was going to truly support
dma operations larger than 4G on 32bit host and 64bit guest, but only
with mmio regions as destination, and with a max overhead of the
max-bounce-size of 1M.

To me map/unmap looks backwards. There's absolutely no point at all to
pretend that RAM isn't always mapped. Furthermore bouncing inside that
layer (at least with the api that you're proposing that can't handle
partial I/O and restart) is obviously broken design.

Once memory hotplug will emerge we've just to add a read write lock
before invoking can-dma/post-dma and stuff. There's no reason to ever
call anything after a read dma completed.

After my stuff would work, my next step would be to get rid entirely
of that per-page array that translates a ram_addr_t to a virtual
address and replace it with a rbtree of linear ranges, and then the
iovec would need to be passed down to exec.c so that it would be
filled with direct dma even if the whole range isn't linear. And in
average a single lookup of the tree would return us immediate
information.

I'm ok to support a not entirely flat ram space, but pretending to
support an API that requires to mangle host ptes (and sptes on kvm
case) every time there's a dma is entirely overkill and backwards,
besides preventing you to bounce sanely if you go over mmio regions
and preventing you as well to dma >4G space of mmio on 32bit build
with 64bit guest.

The whole concept of having to map something is flawed, there's
nothing to map. At most you've to take a read lock to prevent future
memory hotplug to change the memory layout from under you, but the
concept of mapping has nothing to do with that. RAM is always mapped,
and mmio has to be emulated anyway so it's worthless to map it.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 16:47               ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-14 17:01                 ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14 17:01 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, Gerd Hoffmann, qemu-devel, kvm, chrisw

Andrea Arcangeli wrote:
> To me map/unmap looks backwards. There's absolutely no point at all to
> pretend that RAM isn't always mapped.

Actually, with Xen, RAM may be unmapped due to Xen limitations when qemu 
runs on dom0 mode.  But I think map/unmap makes sense even disregarding 
Xen:  if we add memory hotunplug, we need to make sure we don't unplug 
memory that has pending dma operations on it.  map/unmap gives us the 
opportunity to refcount memory slots.

> The whole concept of having to map something is flawed, there's
> nothing to map. At most you've to take a read lock to prevent future
> memory hotplug to change the memory layout from under you, but the
> concept of mapping has nothing to do with that. RAM is always mapped,
> and mmio has to be emulated anyway so it's worthless to map it.
>   

We can't get all dma to stop during hotunplug, since net rx operations 
are long-running (infinite if there is no activity on the link).

IMO, we do want map/unmap, but this would be just a rename of can_dma 
and friends, and wouldn't have at this time any additional 
functionality.  Bouncing has to happen where we have the ability to 
schedule the actual operation, and that's clearly not map/unmap.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 17:01                 ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14 17:01 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, kvm, Gerd Hoffmann, qemu-devel

Andrea Arcangeli wrote:
> To me map/unmap looks backwards. There's absolutely no point at all to
> pretend that RAM isn't always mapped.

Actually, with Xen, RAM may be unmapped due to Xen limitations when qemu 
runs on dom0 mode.  But I think map/unmap makes sense even disregarding 
Xen:  if we add memory hotunplug, we need to make sure we don't unplug 
memory that has pending dma operations on it.  map/unmap gives us the 
opportunity to refcount memory slots.

> The whole concept of having to map something is flawed, there's
> nothing to map. At most you've to take a read lock to prevent future
> memory hotplug to change the memory layout from under you, but the
> concept of mapping has nothing to do with that. RAM is always mapped,
> and mmio has to be emulated anyway so it's worthless to map it.
>   

We can't get all dma to stop during hotunplug, since net rx operations 
are long-running (infinite if there is no activity on the link).

IMO, we do want map/unmap, but this would be just a rename of can_dma 
and friends, and wouldn't have at this time any additional 
functionality.  Bouncing has to happen where we have the ability to 
schedule the actual operation, and that's clearly not map/unmap.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 17:01                 ` [Qemu-devel] " Avi Kivity
@ 2008-12-14 17:15                   ` Andrea Arcangeli
  -1 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-14 17:15 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Anthony Liguori, Gerd Hoffmann, qemu-devel, kvm, chrisw

On Sun, Dec 14, 2008 at 07:01:38PM +0200, Avi Kivity wrote:
> Actually, with Xen, RAM may be unmapped due do Xen limitations when qemu 
> runs on dom0 mode.  But I think map/unmap makes sense even disregarding 

I realize xen 32bit has issues... Qemu/KVM 32bit also has the same
issues but there's no point in 2009 (that's when this stuff could go
productive) in trying to run guests with >2G of ram on a 32bit
host. The issues emerges (I guess with xen too) in trying to run those
obsolete hardware configurations. Even the atom and extremely low
power athlon have 64bit capability, and on embedded that runs a real
32bit cpu I can't see how somebody would want to run a >2G guest.

> Xen:  if we add memory hotunplug, we need to make sure we don't unplug 
> memory that has pending dma operations on it.  map/unmap gives us the 
> opportunity to refcount memory slots.

So memory hotunplug here is considered differently than the real
memory hotplug emulation that simulates removing dimm on the
hardware. This is just the xen trick to handle >4G guest on a 32bit
address space? Well that's just the thing I'm not interested to
support. When 64bit wasn't mainstream it made some sense, these days
it's good enough if we can boot any guest OS (including 64bit ones) on
a 32bit build, but trying to run guests OS with >2G of ram doesn't
look useful.

> We can't get all dma to stop during hotunplug, since net rx operations are 
> long-running (infinite if there is no activity on the link).
>
> IMO, we do want map/unmap, but this would be just a rename of can_dma and 
> friends, and wouldn't have at this time any additional functionality.  
> Bouncing has to happen where we have the ability to schedule the actual 
> operation, and that's clearly not map/unmap.

It would be a bit more than a rename, also keep in mind that in the
longer term as said we need to build the iovec in the exec.c path,
it's not enough to return a void *, I like to support a not 1:1 flat
space to avoid wasting host virtual address space with guest memory
holes. But that's about it, guest memory has to be always mapped, just
not with a 1:1 mapping, and surely not with a per-page array that
translates each page physical address to a host virtual address, but
with ranges. So this map thing that returns a 'void *' won't be there
for long even if I rename.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 17:15                   ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-14 17:15 UTC (permalink / raw)
  To: Avi Kivity; +Cc: chrisw, kvm, Gerd Hoffmann, qemu-devel

On Sun, Dec 14, 2008 at 07:01:38PM +0200, Avi Kivity wrote:
> Actually, with Xen, RAM may be unmapped due do Xen limitations when qemu 
> runs on dom0 mode.  But I think map/unmap makes sense even disregarding 

I realize xen 32bit has issues... Qemu/KVM 32bit also has the same
issues but there's no point in 2009 (that's when this stuff could go
productive) in trying to run guests with >2G of ram on a 32bit
host. The issues emerge (I guess with xen too) in trying to run those
obsolete hardware configurations. Even the atom and extremely low
power athlon have 64bit capability, and on embedded that runs a real
32bit cpu I can't see how somebody would want to run a >2G guest.

> Xen:  if we add memory hotunplug, we need to make sure we don't unplug 
> memory that has pending dma operations on it.  map/unmap gives us the 
> opportunity to refcount memory slots.

So memory hotunplug here is considered differently than the real
memory hotplug emulation that simulates removing dimm on the
hardware. This is just the xen trick to handle >4G guest on a 32bit
address space? Well that's just the thing I'm not interested to
support. When 64bit wasn't mainstream it made some sense, these days
it's good enough if we can boot any guest OS (including 64bit ones) on
a 32bit build, but trying to run guests OS with >2G of ram doesn't
look useful.

> We can't get all dma to stop during hotunplug, since net rx operations are 
> long-running (infinite if there is no activity on the link).
>
> IMO, we do want map/unmap, but this would be just a rename of can_dma and 
> friends, and wouldn't have at this time any additional functionality.  
> Bouncing has to happen where we have the ability to schedule the actual 
> operation, and that's clearly not map/unmap.

It would be a bit more than a rename, also keep in mind that in the
longer term as said we need to build the iovec in the exec.c path,
it's not enough to return a void *, I like to support a not 1:1 flat
space to avoid wasting host virtual address space with guest memory
holes. But that's about it, guest memory has to be always mapped, just
not with a 1:1 mapping, and surely not with a per-page array that
translates each page physical address to a host virtual address, but
with ranges. So this map thing that returns a 'void *' won't be there
for long even if I rename.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-12 19:39       ` [Qemu-devel] " Anthony Liguori
@ 2008-12-14 17:30         ` Jamie Lokier
  -1 siblings, 0 replies; 111+ messages in thread
From: Jamie Lokier @ 2008-12-14 17:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, avi, Gerd Hoffmann, kvm

Anthony Liguori wrote:
> Let me clarify this a bit more.  The problem we're trying to address 
> today is the encapsulating knowledge of phys_ram_base.  We want to 
> minimize the amount of code that makes any assumptions about 
> phys_ram_base.  Your current API still accesses phys_ram_base directly 
> in the PCI DMA API.  The only real improvement compared to the current 
> virtio code is that you properly handle MMIO.  This is not just about 
> layout but this also includes the fact that in the future, guest memory 
> could be discontiguous in QEMU (think memory hotplug).

One little extra thing:

What about PCI DMA to another PCI device, not to RAM?

That's not used very often, but it ought to work.

-- Jamie

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 17:30         ` Jamie Lokier
  0 siblings, 0 replies; 111+ messages in thread
From: Jamie Lokier @ 2008-12-14 17:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, avi, kvm, Gerd Hoffmann

Anthony Liguori wrote:
> Let me clarify this a bit more.  The problem we're trying to address 
> today is the encapsulating knowledge of phys_ram_base.  We want to 
> minimize the amount of code that makes any assumptions about 
> phys_ram_base.  Your current API still accesses phys_ram_base directly 
> in the PCI DMA API.  The only real improvement compared to the current 
> virtio code is that you properly handle MMIO.  This is not just about 
> layout but this also includes the fact that in the future, guest memory 
> could be discontiguous in QEMU (think memory hotplug).

One little extra thing:

What about PCI DMA to another PCI device, not to RAM?

That's not used very often, but it ought to work.

-- Jamie

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14  6:03                 ` Avi Kivity
@ 2008-12-14 19:10                   ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-14 19:10 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>>>
>>> There are N users of this code, all of which would need to cope with 
>>> the
>>> failure.  Or there could be one user (dma.c) which handles the failure
>>> and the bouncing.
>>>   
>>
>> N should be small long term.  It should only be for places that would 
>> interact directly with CPU memory.  This would be the PCI BUS, the 
>> ISA BUS, some speciality devices, and possibly virtio (although you 
>> could argue it should go through the PCI BUS).
>
> Fine, then let's rename it pci-dma.c.

Better, but then it should be part of pci.c.  I've thought quite a bit 
about it, and I'm becoming less convinced that this sort of API is going 
to be helpful.

I was thinking that we need to make one minor change to the map API I 
proposed.  It should return a mapped size as an output parameter and 
take a flag as to whether partial mappings can be handled.  The effect 
would be that you never bounce to RAM which means that you can also 
quite accurately determine the maximum amount of bouncing (it should be 
proportional to the amount of MMIO memory that's registered).

For virtio, I think the only change that we would make is to replace the 
existing bouncing code with calls to map()/unmap() for each 
scatter/gather array.  You may need to cope with a scatter/gather list 
that is larger than the inputted vector but that's easy.

This means you always map complete requests which is where I think it 
fundamentally differs from what you propose.  However, this is safe 
because you should be able to guarantee that you can always bounce the 
MMIO memory at least once.  The only place where you could run into 
trouble is if the same MMIO page is mapped multiple times in a single 
scatter/gather.  You could, in theory, attempt to cache MMIO bouncing 
but that's probably unnecessary.

So virtio would not need this dma API.

I think the same is true about IDE/SCSI.  I think we can build code that 
relies on being able to completely map a scatter/gather list as long as 
we make the map function sufficiently smart.  As long as our bounce pool 
is as large as the registered MMIO memory, we can always be safe if 
we're sufficiently clever.

Xen is a more difficult case but I'm perfectly willing to punt that to 
them to figure out.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 19:10                   ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-14 19:10 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Avi Kivity wrote:
> Anthony Liguori wrote:
>>>
>>> There are N users of this code, all of which would need to cope with 
>>> the
>>> failure.  Or there could be one user (dma.c) which handles the failure
>>> and the bouncing.
>>>   
>>
>> N should be small long term.  It should only be for places that would 
>> interact directly with CPU memory.  This would be the PCI BUS, the 
>> ISA BUS, some speciality devices, and possibly virtio (although you 
>> could argue it should go through the PCI BUS).
>
> Fine, then let's rename it pci-dma.c.

Better, but then it should be part of pci.c.  I've thought quite a bit 
about it, and I'm becoming less convinced that this sort of API is going 
to be helpful.

I was thinking that we need to make one minor change to the map API I 
proposed.  It should return a mapped size as an output parameter and 
take a flag as to whether partial mappings can be handled.  The effect 
would be that you never bounce to RAM which means that you can also 
quite accurately determine the maximum amount of bouncing (it should be 
proportional to the amount of MMIO memory that's registered).

For virtio, I think the only change that we would make is to replace the 
existing bouncing code with calls to map()/unmap() for each 
scatter/gather array.  You may need to cope with a scatter/gather list 
that is larger than the inputted vector but that's easy.

This means you always map complete requests which is where I think it 
fundamentally differs from what you propose.  However, this is safe 
because you should be able to guarantee that you can always bounce the 
MMIO memory at least once.  The only place where you could run into 
trouble is if the same MMIO page is mapped multiple times in a single 
scatter/gather.  You could, in theory, attempt to cache MMIO bouncing 
but that's probably unnecessary.

So virtio would not need this dma API.

I think the same is true about IDE/SCSI.  I think we can build code that 
relies on being able to completely map a scatter/gather list as long as 
we make the map function sufficiently smart.  As long as our bounce pool 
is as large as the registered MMIO memory, we can always be safe if 
we're sufficiently clever.

Xen is a more difficult case but I'm perfectly willing to punt that to 
them to figure out.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 19:10                   ` Anthony Liguori
@ 2008-12-14 19:49                     ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14 19:49 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Anthony Liguori wrote:
> I've thought quite a bit about it, and I'm becoming less convinced 
> that this sort of API is going to be helpful.
>
> I was thinking that we need to make one minor change to the map API I 
> proposed.  It should return a mapped size as an output parameter and 
> take a flag as to whether partial mappings can be handled.  The effect 
> would be that you never bounce to RAM which means that you can also 
> quite accurately determine the maximum amount of bouncing (it should 
> be proportional to the amount of MMIO memory that's registered).

That's pointless; cirrus for example has 8MB of mmio while a cpu-to-vram 
blit is in progress, and some random device we'll add tomorrow could 
easily introduce more.  Our APIs shouldn't depend on properties of 
emulated hardware, at least as much as possible.

Of course you don't need the API if you replicate its functionality in 3 
block devices (+ out of tree), 4+ network devices (+1 out of tree), and 
any other new dma client that comes along.  But I think that coding the 
bouncing and rescheduling logic in all of those devices is a mistake, 
for reasons I don't even wish to point out.

Just as an example: as presented, the maximum bounce memory allocated is 
equal to the number of concurrent requests issued by all dma capable 
devices multiplied by the maximum dma buffer size.  Should we consider 
this to be a deficiency (and I do), it is easy to modify dma.c to limit 
the amount of allocated memory and defer requests when we are out.  It 
would be a horrible pain to do this using map()/unmap() alone.

The logic in dma.c cannot be made to go away.  You can either put it in 
the dma clients, or in map()/unmap(), or leave it in dma.c.  While 
putting it in map()/unmap() is possible (though undesirable IMO), 
letting the dma clients handle these details is a big mistake.

I'll enumerate the functions that dma.c provides:
- convert guest physical addresses to host virtual addresses
- construct an iovec for scatter/gather
- handle guest physical addresses for which no host virtual addresses 
exist, while controlling memory use
- take care of the dirty bit
- provide a place for adding hooks to hardware that can modify dma 
operations generically (emulated iommus, transforming dma engines)

I believe that a dma api that fails to address all of these requirements 
is trying to solve too few problems at the same time, and will either 
cause dma clients to be unduly complicated, or will require rewriting.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 19:49                     ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14 19:49 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Anthony Liguori wrote:
> I've thought quite a bit about it, and I'm becoming less convinced 
> that this sort of API is going to be helpful.
>
> I was thinking that we need to make one minor change to the map API I 
> proposed.  It should return a mapped size as an output parameter and 
> take a flag as to whether partial mappings can be handled.  The effect 
> would be that you never bounce to RAM which means that you can also 
> quite accurately determine the maximum amount of bouncing (it should 
> be proportional to the amount of MMIO memory that's registered).

That's pointless; cirrus for example has 8MB of mmio while a cpu-to-vram 
blit is in progress, and some random device we'll add tomorrow could 
easily introduce more.  Our APIs shouldn't depend on properties of 
emulated hardware, at least as much as possible.

Of course you don't need the API if you replicate its functionality in 3 
block devices (+ out of tree), 4+ network devices (+1 out of tree), and 
any other new dma client that comes along.  But I think that coding the 
bouncing and rescheduling logic in all of those devices is a mistake, 
for reasons I don't even wish to point out.

Just as an example: as presented, the maximum bounce memory allocated is 
equal to the number of concurrent requests issues by all dma capable 
devices multiplied by the maximum dma buffer size.  Should we consider 
this to be a deficiency (and I do), it is easy to modify dma.c to limit 
the amount of allocated memory and defer requests when we are out.  It 
would be a horrible pain to do this using map()/unmap() alone.

The logic in dma.c cannot be made to go away.  You can either put it in 
the dma clients, or in map()/unmap(), or leave it in dma.c.  While 
putting it in map()/unmap() is possible (though undesirable IMO), 
letting the dma clients handle these details is a big mistake.

I'll enumerate the functions that dma.c provides:
- convert guest physical addresses to host virtual addresses
- construct an iovec for scatter/gather
- handle guest physical addresses for which no host virtual addresses 
exist, while controlling memory use
- take care of the dirty bit
- provide a place for adding hooks to hardware that can modify dma 
operations generically (emulated iommus, transforming dma engines)

I believe that a dma api that fails to address all of these requirements 
is trying to solve too few problems at the same time, and will either 
cause dma clients to be unduly complicated, or will require rewriting.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 17:15                   ` [Qemu-devel] " Andrea Arcangeli
@ 2008-12-14 19:59                     ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14 19:59 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, Gerd Hoffmann, qemu-devel, kvm, chrisw

Andrea Arcangeli wrote:
> On Sun, Dec 14, 2008 at 07:01:38PM +0200, Avi Kivity wrote:
>   
>> Actually, with Xen, RAM may be unmapped due to Xen limitations when qemu 
>> runs on dom0 mode.  But I think map/unmap makes sense even disregarding 
>>     
>
> I realize xen 32bit has issues... Qemu/KVM 32bit also has the same
> issues but there's no point in 2009 (that's when this stuff could go
> productive) in trying to run guests with >2G of ram on a 32bit
> host. The issue emerges (I guess with xen too) in trying to run those
> obsolete hardware configurations. Even the atom and extremely low
> power athlon have 64bit capability, and on embedded that runs a real
> 32bit cpu I can't see how somebody would want to run a >2G guest.
>   

kvm and Xen actually have different issues for 32-bit.  For kvm, 
supporting >2G on 32-bits is possible but messy and pointless, so we 
chose not to do it.  For Xen, this is a critical performance issue as 
64-bit userspace in pv guests is quite slow.  So dom0 runs as a 32-bit guest.

Newer Xen shouldn't have this problem though; it runs qemu in kernel 
mode in a dedicated 64-bit domain.

>> Xen:  if we add memory hotunplug, we need to make sure we don't unplug 
>> memory that has pending dma operations on it.  map/unmap gives us the 
>> opportunity to refcount memory slots.
>>     
>
> So memory hotunplug here is considered differently than the real
> memory hotplug emulation that simulates removing dimm on the
> hardware. This is just the xen trick to handle >4G guest on a 32bit
> address space? Well that's just the thing I'm not interested to
> support. When 64bit wasn't mainstream it made some sense, these days
> it's good enough if we can boot any guest OS (including 64bit ones) on
> a 32bit build, but trying to run guests OS with >2G of ram doesn't
> look useful.
>   

Leaving Xen aside, memory hotunplug requires that we tell the memory 
when it's used and when it isn't.

>> We can't get all dma to stop during hotunplug, since net rx operations are 
>> long-running (infinite if there is no activity on the link).
>>
>> IMO, we do want map/unmap, but this would be just a rename of can_dma and 
>> friends, and wouldn't have at this time any additional functionality.  
>> Bouncing has to happen where we have the ability to schedule the actual 
>> operation, and that's clearly not map/unmap.
>>     
>
> It would be a bit more than a rename, also keep in mind that in the
> longer term as said we need to build the iovec in the exec.c path,
> it's not enough to return a void *, I like to support a not 1:1 flat
> space to avoid wasting host virtual address space with guest memory
> holes. But that's about it, guest memory has to be always mapped, just
> not with a 1:1 mapping, and surely not with a per-page array that
> translates each page physical address to a host virtual address, but
> with ranges. So this map thing that returns a 'void *' won't be there
> for long even if I rename.
>   

If it returns an iovec, still that doesn't change how it works.  I like 
the symmetry of map()/unmap() and the lock/unlock semantics (like 
kmap_atomic/kunmap_atomic and a myriad other get/put pairs).

[There's actually a language that supports this idiom, but that's a 
different flamewar]

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 19:59                     ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-14 19:59 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chrisw, kvm, Gerd Hoffmann, qemu-devel

Andrea Arcangeli wrote:
> On Sun, Dec 14, 2008 at 07:01:38PM +0200, Avi Kivity wrote:
>   
>> Actually, with Xen, RAM may be unmapped due to Xen limitations when qemu 
>> runs on dom0 mode.  But I think map/unmap makes sense even disregarding 
>>     
>
> I realize xen 32bit has issues... Qemu/KVM 32bit also has the same
> issues but there's no point in 2009 (that's when this stuff could go
> productive) in trying to run guests with >2G of ram on a 32bit
> host. The issue emerges (I guess with xen too) in trying to run those
> obsolete hardware configurations. Even the atom and extremely low
> power athlon have 64bit capability, and on embedded that runs a real
> 32bit cpu I can't see how somebody would want to run a >2G guest.
>   

kvm and Xen actually have different issues for 32-bit.  For kvm, 
supporting >2G on 32-bits is possible but messy and pointless, so we 
chose not to do it.  For Xen, this is a critical performance issue as 
64-bit userspace in pv guests is quite slow.  So dom0 runs as a 32-bit guest.

Newer Xen shouldn't have this problem though; it runs qemu in kernel 
mode in a dedicated 64-bit domain.

>> Xen:  if we add memory hotunplug, we need to make sure we don't unplug 
>> memory that has pending dma operations on it.  map/unmap gives us the 
>> opportunity to refcount memory slots.
>>     
>
> So memory hotunplug here is considered differently than the real
> memory hotplug emulation that simulates removing dimm on the
> hardware. This is just the xen trick to handle >4G guest on a 32bit
> address space? Well that's just the thing I'm not interested to
> support. When 64bit wasn't mainstream it made some sense, these days
> it's good enough if we can boot any guest OS (including 64bit ones) on
> a 32bit build, but trying to run guests OS with >2G of ram doesn't
> look useful.
>   

Leaving Xen aside, memory hotunplug requires that we tell the memory 
when it's used and when it isn't.

>> We can't get all dma to stop during hotunplug, since net rx operations are 
>> long-running (infinite if there is no activity on the link).
>>
>> IMO, we do want map/unmap, but this would be just a rename of can_dma and 
>> friends, and wouldn't have at this time any additional functionality.  
>> Bouncing has to happen where we have the ability to schedule the actual 
>> operation, and that's clearly not map/unmap.
>>     
>
> It would be a bit more than a rename, also keep in mind that in the
> longer term as said we need to build the iovec in the exec.c path,
> it's not enough to return a void *, I like to support a not 1:1 flat
> space to avoid wasting host virtual address space with guest memory
> holes. But that's about it, guest memory has to be always mapped, just
> not with a 1:1 mapping, and surely not with a per-page array that
> translates each page physical address to a host virtual address, but
> with ranges. So this map thing that returns a 'void *' won't be there
> for long even if I rename.
>   

If it returns an iovec, still that doesn't change how it works.  I like 
the symmetry of map()/unmap() and the lock/unlock semantics (like 
kmap_atomic/kunmap_atomic and a myriad other get/put pairs).

[There's actually a language that supports this idiom, but that's a 
different flamewar]

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 19:49                     ` Avi Kivity
@ 2008-12-14 23:08                       ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-14 23:08 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>> I've thought quite a bit about it, and I'm becoming less convinced 
>> that this sort of API is going to be helpful.
>>
>> I was thinking that we need to make one minor change to the map API I 
>> proposed.  It should return a mapped size as an output parameter and 
>> take a flag as to whether partial mappings can be handled.  The 
>> effect would be that you never bounce to RAM which means that you can 
>> also quite accurately determine the maximum amount of bouncing (it 
>> should be proportional to the amount of MMIO memory that's registered).
>
> That's pointless; cirrus for example has 8MB of mmio while a 
> cpu-to-vram blit is in progress, and some random device we'll add 
> tomorrow could easily introduce more.  Our APIs shouldn't depend on 
> properties of emulated hardware, at least as much as possible.

One way to think of what I'm suggesting, is that if for every 
cpu_register_physical_memory call for MMIO, we allocated a buffer, then 
whenever map() was called on MMIO, we would return that already 
allocated buffer.  The overhead is fixed and honestly relatively small.  
Much smaller than dma.c proposes.

But you can be smarter, and lazily allocate those buffers for MMIO.  
That will reduce the up front memory consumption.  I'd be perfectly 
happy though if the first implementation just malloc() on 
cpu_register_physical_memory for the sake of simplicity.

> I'll enumerate the functions that dma.c provides:
> - convert guest physical addresses to host virtual addresses

The map() api does this.

> - construct an iovec for scatter/gather

The map() api does not do this.

> - handle guest physical addresses for which no host virtual addresses 
> exist, while controlling memory use
> - take care of the dirty bit
> - provide a place for adding hooks to hardware that can modify dma 
> operations generically (emulated iommus, transforming dma engines)

The map() api does all of this.

> I believe that a dma api that fails to address all of these 
> requirements is trying to solve too few problems at the same time, and 
> will either cause dma clients to be unduly complicated, or will 
> require rewriting.

I think there's a disconnect between what you describe and what the 
current code does.  I think there's a very simple solution, let's start 
with the map() api.  I'm convinced that the virtio support for it will 
be trivial and that virtio will not benefit from the dma.c api 
proposed.  I'm willing to do the virtio map implementation to 
demonstrate this.  Let's see two implementations that use the dma.c api 
before we commit to it.  I'd like to see at least a network device and a 
block device.  I don't believe network devices will benefit from it 
because they don't support partial submissions.

I like any API that reduces duplicate code.  It's easy to demonstrate 
that with patches.  Based on my current understanding of the API and 
what I expect from the devices using it, I don't believe the API will 
actually do that.  It's quite easy to prove me wrong though.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-14 23:08                       ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-14 23:08 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Avi Kivity wrote:
> Anthony Liguori wrote:
>> I've thought quite a bit about it, and I'm becoming less convinced 
>> that this sort of API is going to be helpful.
>>
>> I was thinking that we need to make one minor change to the map API I 
>> proposed.  It should return a mapped size as an output parameter and 
>> take a flag as to whether partial mappings can be handled.  The 
>> effect would be that you never bounce to RAM which means that you can 
>> also quite accurately determine the maximum amount of bouncing (it 
>> should be proportional to the amount of MMIO memory that's registered).
>
> That's pointless; cirrus for example has 8MB of mmio while a 
> cpu-to-vram blit is in progress, and some random device we'll add 
> tomorrow could easily introduce more.  Our APIs shouldn't depend on 
> properties of emulated hardware, at least as much as possible.

One way to think of what I'm suggesting, is that if for every 
cpu_register_physical_memory call for MMIO, we allocated a buffer, then 
whenever map() was called on MMIO, we would return that already 
allocated buffer.  The overhead is fixed and honestly relatively small.  
Much smaller than dma.c proposes.

But you can be smarter, and lazily allocate those buffers for MMIO.  
That will reduce the up front memory consumption.  I'd be perfectly 
happy though if the first implementation just malloc() on 
cpu_register_physical_memory for the sake of simplicity.

> I'll enumerate the functions that dma.c provides:
> - convert guest physical addresses to host virtual addresses

The map() api does this.

> - construct an iovec for scatter/gather

The map() api does not do this.

> - handle guest physical addresses for which no host virtual addresses 
> exist, while controlling memory use
> - take care of the dirty bit
> - provide a place for adding hooks to hardware that can modify dma 
> operations generically (emulated iommus, transforming dma engines)

The map() api does all of this.

> I believe that a dma api that fails to address all of these 
> requirements is trying to solve too few problems at the same time, and 
> will either cause dma clients to be unduly complicated, or will 
> require rewriting.

I think there's a disconnect between what you describe and what the 
current code does.  I think there's a very simple solution, let's start 
with the map() api.  I'm convinced that the virtio support for it will 
be trivial and that virtio will not benefit from the dma.c api 
proposed.  I'm willing to do the virtio map implementation to 
demonstrate this.  Let's see two implementations that use the dma.c api 
before we commit to it.  I'd like to see at least a network device and a 
block device.  I don't believe network devices will benefit from it 
because they don't support partial submissions.

I like any API that reduces duplicate code.  It's easy to demonstrate 
that with patches.  Based on my current understanding of the API and 
what I expect from the devices using it, I don't believe the API will 
actually do that.  It's quite easy to prove me wrong though.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 23:08                       ` Anthony Liguori
@ 2008-12-15  0:57                         ` Paul Brook
  -1 siblings, 0 replies; 111+ messages in thread
From: Paul Brook @ 2008-12-15  0:57 UTC (permalink / raw)
  To: qemu-devel
  Cc: Anthony Liguori, Avi Kivity, Andrea Arcangeli, chrisw, kvm,
	Gerd Hoffmann

> > That's pointless; cirrus for example has 8MB of mmio while a
> > cpu-to-vram blit is in progress, and some random device we'll add
> > tomorrow could easily introduce more.  Our APIs shouldn't depend on
> > properties of emulated hardware, at least as much as possible.
>
> One way to think of what I'm suggesting, is that if for every
> cpu_register_physical_memory call for MMIO, we allocated a buffer, then
> whenever map() was called on MMIO, we would return that already
> allocated buffer.  The overhead is fixed and honestly relatively small.
> Much smaller than dma.c proposes.

I wouldn't be surprised if some machines had a large memory mapped IO space.  
Most of it might not be actively used, but once you start considering 64-bit 
machines on 32-bit hosts these allocations would become problematic.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-15  0:57                         ` Paul Brook
  0 siblings, 0 replies; 111+ messages in thread
From: Paul Brook @ 2008-12-15  0:57 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann, Avi Kivity

> > That's pointless; cirrus for example has 8MB of mmio while a
> > cpu-to-vram blit is in progress, and some random device we'll add
> > tomorrow could easily introduce more.  Our APIs shouldn't depend on
> > properties of emulated hardware, at least as much as possible.
>
> One way to think of what I'm suggesting, is that if for every
> cpu_register_physical_memory call for MMIO, we allocated a buffer, then
> whenever map() was called on MMIO, we would return that already
> allocated buffer.  The overhead is fixed and honestly relatively small.
> Much smaller than dma.c proposes.

I wouldn't be surprised if some machines had a large memory mapped IO space.  
Most of it might not be actively used, but once you start considering 64-bit 
machines on 32-bit hosts these allocations would become problematic.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-15  0:57                         ` Paul Brook
@ 2008-12-15  2:09                           ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-15  2:09 UTC (permalink / raw)
  To: Paul Brook
  Cc: qemu-devel, Avi Kivity, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

Paul Brook wrote:
>>> That's pointless; cirrus for example has 8MB of mmio while a
>>> cpu-to-vram blit is in progress, and some random device we'll add
>>> tomorrow could easily introduce more.  Our APIs shouldn't depend on
>>> properties of emulated hardware, at least as much as possible.
>>>       
>> One way to think of what I'm suggesting, is that if for every
>> cpu_register_physical_memory call for MMIO, we allocated a buffer, then
>> whenever map() was called on MMIO, we would return that already
>> allocated buffer.  The overhead is fixed and honestly relatively small.
>> Much smaller than dma.c proposes.
>>     
>
> I Wouldn't be surprised if some machines had a large memory mapped IO space.  
> Most of it might not be actively used, but once you start considering 64-bit 
> machines on 32-bit hosts these allocations would become problematic.
>   

At some level, any DMA API would fall apart here.  Certain IO operations 
simply cannot be split (such as network send).  If you don't pass a 
flag to the map function that says you can handle partial results, you 
must cope with map() returning NULL (by dropping the packet).

Since MMIO space is something we control as we implement device 
emulation, we can assert that this isn't a problem today, and then deal 
with that particular special case when we come to it.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-15  2:09                           ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-15  2:09 UTC (permalink / raw)
  To: Paul Brook
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann, Avi Kivity

Paul Brook wrote:
>>> That's pointless; cirrus for example has 8MB of mmio while a
>>> cpu-to-vram blit is in progress, and some random device we'll add
>>> tomorrow could easily introduce more.  Our APIs shouldn't depend on
>>> properties of emulated hardware, at least as much as possible.
>>>       
>> One way to think of what I'm suggesting, is that if for every
>> cpu_register_physical_memory call for MMIO, we allocated a buffer, then
>> whenever map() was called on MMIO, we would return that already
>> allocated buffer.  The overhead is fixed and honestly relatively small.
>> Much smaller than dma.c proposes.
>>     
>
> I wouldn't be surprised if some machines had a large memory mapped IO space.  
> Most of it might not be actively used, but once you start considering 64-bit 
> machines on 32-bit hosts these allocations would become problematic.
>   

At some level, any DMA API would fall apart here.  Certain IO operations 
simply cannot be split (such as network send).  If you don't pass a 
flag to the map function that says you can handle partial results, you 
must cope with map() returning NULL (by dropping the packet).

Since MMIO space is something we control as we implement device 
emulation, we can assert that this isn't a problem today, and then deal 
with that particular special case when we come to it.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-15  2:09                           ` Anthony Liguori
@ 2008-12-15  6:23                             ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-15  6:23 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Paul Brook, qemu-devel, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

Anthony Liguori wrote:
>>
>> I Wouldn't be surprised if some machines had a large memory mapped IO 
>> space.  Most of it might not be actively used, but once you start 
>> considering 64-bit machines on 32-bit hosts these allocations would 
>> become problematic.
>>   
>
> At some level, any DMA API would fall apart here.  Certain IO 
> operations simply cannot be split (such as network send).  If you pass 
> don't pass a flag to the map function that says you can handle partial 
> results, you must cope with map() returning NULL (by dropping the 
> packet).
>

... and then you must cope with it.  In all non-network dma clients.  Or 
in the dma api.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-15  6:23                             ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-15  6:23 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann, Paul Brook

Anthony Liguori wrote:
>>
>> I Wouldn't be surprised if some machines had a large memory mapped IO 
>> space.  Most of it might not be actively used, but once you start 
>> considering 64-bit machines on 32-bit hosts these allocations would 
>> become problematic.
>>   
>
> At some level, any DMA API would fall apart here.  Certain IO 
> operations simply cannot be split (such as network send).  If you pass 
> don't pass a flag to the map function that says you can handle partial 
> results, you must cope with map() returning NULL (by dropping the 
> packet).
>

... and then you must cope with it.  In all non-network dma clients.  Or 
in the dma api.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 23:08                       ` Anthony Liguori
@ 2008-12-15 18:35                         ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-15 18:35 UTC (permalink / raw)
  To: qemu-devel, Anthony Liguori
  Cc: Avi Kivity, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

[-- Attachment #1: Type: text/plain, Size: 4357 bytes --]

On 12/15/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Avi Kivity wrote:
>
> > Anthony Liguori wrote:
> >
> > > I've thought quite a bit about it, and I'm becoming less convinced that
> this sort of API is going to be helpful.
> > >
> > > I was thinking that we need to make one minor change to the map API I
> proposed.  It should return a mapped size as an output parameter and take a
> flag as to whether partial mappings can be handled.  The effect would be
> that you never bounce to RAM which means that you can also quite accurately
> determine the maximum amount of bouncing (it should be proportional to the
> amount of MMIO memory that's registered).
> > >
> >
> > That's pointless; cirrus for example has 8MB of mmio while a cpu-to-vram
> blit is in progress, and some random device we'll add tomorrow could easily
> introduce more.  Our APIs shouldn't depend on properties of emulated
> hardware, at least as much as possible.
> >
>
>  One way to think of what I'm suggesting, is that if for every
> cpu_register_physical_memory call for MMIO, we allocated a buffer, then
> whenever map() was called on MMIO, we would return that already allocated
> buffer.  The overhead is fixed and honestly relatively small.  Much smaller
> than dma.c proposes.
>
>  But you can be smarter, and lazily allocate those buffers for MMIO.  That
> will reduce the up front memory consumption.  I'd be perfectly happy though
> if the first implementation just malloc() on cpu_register_physical_memory
> for the sake of simplicity.

Maybe the buffer thing should not be called DMA, but something like
bounce-buffers?

> > I'll enumerate the functions that dma.c provides:
> > - convert guest physical addresses to host virtual addresses
> >
>
>  The map() api does this.
>
>
> > - construct an iovec for scatter/gather
> >
>
>  The map() api does not do this.
>
>
> > - handle guest physical addresses for which no host virtual addresses
> exist, while controlling memory use
> > - take care of the dirty bit
> > - provide a place for adding hooks to hardware that can modify dma
> operations generically (emulated iommus, transforming dma engines)
> >
>
>  The map() api does all of this.
>
>
> > I believe that a dma api that fails to address all of these requirements
> is trying to solve too few problems at the same time, and will either cause
> dma clients to be unduly complicated, or will require rewriting.
> >
>
>  I think there's a disconnect between what you describe and what the current
> code does.  I think there's a very simple solution, let's start with the
> map() api.  I'm convinced that the virtio support for it will be trivial and
> that virtio will not benefit from the dma.c api proposed.  I'm willing to do
> the virtio map implementation to demonstrate this.  Let's see two
> implementations that use the dma.c api before we commit to it.  I'd like to
> see at least a network device and a block device.  I don't believe network
> devices will benefit from it because they don't support partial submissions.
>
>  I like any API that reduces duplicate code.  It's easy to demonstrate that
> with patches.  Based on my current understanding of the API and what I
> expect from the devices using it, I don't believe the API will actually do
> that.  It's quite easy to prove me wrong though.

I changed the ESP SCSI and Lance Ethernet on Sparc32 to resolve the IO
address to physical memory (see patch). ESP works (no zero copy yet),
Lance doesn't. It looks much better. Because the resolving activity is
performed in serial steps, unbounded IO vector allocation does not
happen, but we still could launch as many IO as there are free IO
vectors.

There are still some issues I'm not happy with yet:
- handling of access violations: resolving should stop before the bad
page, the transfers should be done until that and then post error.
- bounce buffers needed for Lance byte swapping are not well designed (stack)

This led me to the thought that maybe we should not hide the bounce
buffer activity, but instead make it more explicit for the device that
needs bouncing. For the other device, the buffering or lack of it
should be opaque.

Also the virtual-to-physical address resolution API could be generic,
ie all resolver functions should take same parameters so that the
devices would not need to know the next higher level device.

[-- Attachment #2: translation-api-sparc.diff --]
[-- Type: plain/text, Size: 13400 bytes --]

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-15 18:35                         ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-15 18:35 UTC (permalink / raw)
  To: qemu-devel, Anthony Liguori
  Cc: Andrea Arcangeli, chrisw, Avi Kivity, kvm, Gerd Hoffmann

[-- Attachment #1: Type: text/plain, Size: 4357 bytes --]

On 12/15/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Avi Kivity wrote:
>
> > Anthony Liguori wrote:
> >
> > > I've thought quite a bit about it, and I'm becoming less convinced that
> this sort of API is going to be helpful.
> > >
> > > I was thinking that we need to make one minor change to the map API I
> proposed.  It should return a mapped size as an output parameter and take a
> flag as to whether partial mappings can be handled.  The effect would be
> that you never bounce to RAM which means that you can also quite accurately
> determine the maximum amount of bouncing (it should be proportional to the
> amount of MMIO memory that's registered).
> > >
> >
> > That's pointless; cirrus for example has 8MB of mmio while a cpu-to-vram
> blit is in progress, and some random device we'll add tomorrow could easily
> introduce more.  Our APIs shouldn't depend on properties of emulated
> hardware, at least as much as possible.
> >
>
>  One way to think of what I'm suggesting, is that if for every
> cpu_register_physical_memory call for MMIO, we allocated a buffer, then
> whenever map() was called on MMIO, we would return that already allocated
> buffer.  The overhead is fixed and honestly relatively small.  Much smaller
> than dma.c proposes.
>
>  But you can be smarter, and lazily allocate those buffers for MMIO.  That
> will reduce the up front memory consumption.  I'd be perfectly happy though
> if the first implementation just malloc() on cpu_register_physical_memory
> for the sake of simplicity.

Maybe the buffer thing should not be called DMA, but something like
bounce-buffers?

> > I'll enumerate the functions that dma.c provides:
> > - convert guest physical addresses to host virtual addresses
> >
>
>  The map() api does this.
>
>
> > - construct an iovec for scatter/gather
> >
>
>  The map() api does not do this.
>
>
> > - handle guest physical addresses for which no host virtual addresses
> exist, while controlling memory use
> > - take care of the dirty bit
> > - provide a place for adding hooks to hardware that can modify dma
> operations generically (emulated iommus, transforming dma engines)
> >
>
>  The map() api does all of this.
>
>
> > I believe that a dma api that fails to address all of these requirements
> is trying to solve too few problems at the same time, and will either cause
> dma clients to be unduly complicated, or will require rewriting.
> >
>
>  I think there's a disconnect between what you describe and what the current
> code does.  I think there's a very simple solution, let's start with the
> map() api.  I'm convinced that the virtio support for it will be trivial and
> that virtio will not benefit from the dma.c api proposed.  I'm willing to do
> the virtio map implementation to demonstrate this.  Let's see two
> implementations that use the dma.c api before we commit to it.  I'd like to
> see at least a network device and a block device.  I don't believe network
> devices will benefit from it because they don't support partial submissions.
>
>  I like any API that reduces duplicate code.  It's easy to demonstrate that
> with patches.  Based on my current understanding of the API and what I
> expect from the devices using it, I don't believe the API will actually do
> that.  It's quite easy to prove me wrong though.

I changed the ESP SCSI and Lance Ethernet on Sparc32 to resolve the IO
address to physical memory (see patch). ESP works (no zero copy yet),
Lance doesn't. It looks much better. Because the resolving activity is
performed in serial steps, unbounded IO vector allocation does not
happen, but we still could launch as many IO as there are free IO
vectors.

There are still some issues I'm not happy with yet:
- handling of access violations: resolving should stop before the bad
page, the transfers should be done until that and then post error.
- bounce buffers needed for Lance byte swapping are not well designed (stack)

This led me to the thought that maybe we should not hide the bounce
buffer activity, but instead make it more explicit for the device that
needs bouncing. For the other device, the buffering or lack of it
should be opaque.

Also the virtual-to-physical address resolution API could be generic,
ie all resolver functions should take same parameters so that the
devices would not need to know the next higher level device.

[-- Attachment #2: translation-api-sparc.diff --]
[-- Type: plain/text, Size: 13400 bytes --]

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-15 18:35                         ` Blue Swirl
@ 2008-12-15 22:06                           ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-15 22:06 UTC (permalink / raw)
  To: Blue Swirl
  Cc: qemu-devel, Avi Kivity, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

Blue Swirl wrote:
> I changed the ESP SCSI and Lance Ethernet on Sparc32 to resolve the IO
> address to physical memory (see patch). ESP works (no zero copy yet),
> Lance doesn't. It looks much better. Because the resolving activity is
> performed in serial steps, unbounded IO vector allocation does not
> happen, but we still could launch as many IO as there are free IO
> vectors.
>   

It is a good cleanup.

> There are still some issues I'm not happy yet:
> - handling of access violations: resolving should stop before the bad
> page, the transfers should be done until that and then post error.
> - bounce buffers needed for Lance byte swapping are not well designed (stack)
>   

I think you could approach the bouncing via a map/unmap API but I'm not 
sure.  You would need a map() function to take a virtual address which 
is sort of weird.  That would allow you to stack them in an arbitrary 
fashion though.

> This lead me to the thought that maybe we should not hide the bounce
> buffer activity, but instead make it more explicit for the device that
> needs bouncing. For the other device, the buffering or lack of it
> should be opaque.
>   

I think that's reasonable.

> Also the virtual-to-physical address resolution API could be generic,
> ie all resolver functions should take same parameters so that the
> devices would not need to know the next higher level device.
>   

Yes.  I think this is key.  The only observation I would make is that 
the resolution API should have some sort of release function (so 
map/unmap, lock/unlock, whatever).

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-15 22:06                           ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-15 22:06 UTC (permalink / raw)
  To: Blue Swirl
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann, Avi Kivity

Blue Swirl wrote:
> I changed the ESP SCSI and Lance Ethernet on Sparc32 to resolve the IO
> address to physical memory (see patch). ESP works (no zero copy yet),
> Lance doesn't. It looks much better. Because the resolving activity is
> performed in serial steps, unbounded IO vector allocation does not
> happen, but we still could launch as many IO as there are free IO
> vectors.
>   

It is a good cleanup.

> There are still some issues I'm not happy yet:
> - handling of access violations: resolving should stop before the bad
> page, the transfers should be done until that and then post error.
> - bounce buffers needed for Lance byte swapping are not well designed (stack)
>   

I think you could approach the bouncing via a map/unmap API but I'm not 
sure.  You would need a map() function to take a virtual address which 
is sort of weird.  That would allow you to stack them in an arbitrary 
fashion though.

> This lead me to the thought that maybe we should not hide the bounce
> buffer activity, but instead make it more explicit for the device that
> needs bouncing. For the other device, the buffering or lack of it
> should be opaque.
>   

I think that's reasonable.

> Also the virtual-to-physical address resolution API could be generic,
> ie all resolver functions should take same parameters so that the
> devices would not need to know the next higher level device.
>   

Yes.  I think this is key.  The only observation I would make is that 
the resolution API should have some sort of release function (so 
map/unmap, lock/unlock, whatever).

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-15 22:06                           ` Anthony Liguori
@ 2008-12-16  9:41                             ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-16  9:41 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Blue Swirl, qemu-devel, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

Anthony Liguori wrote:
>> There are still some issues I'm not happy yet:
>> - handling of access violations: resolving should stop before the bad
>> page, the transfers should be done until that and then post error.
>> - bounce buffers needed for Lance byte swapping are not well designed 
>> (stack)
>>   
>
> I think you could approach the bouncing via a map/unmap API but I'm 
> not sure.  You would need a map() function to take a virtual address 
> which is sort of weird.  That would allow you to stack them in an 
> arbitrary fashion though.
>

I think that's broken.  iommus convert physical addresses to physical 
addresses (or bus addresses), possibly generating faults along the way, 
and depending on the iommu context.  map()/unmap() converts physical/bus 
addresses to virtual addresses, possibly bouncing.  Except for both 
doing conversions, they're very different.

>> This lead me to the thought that maybe we should not hide the bounce
>> buffer activity, but instead make it more explicit for the device that
>> needs bouncing. For the other device, the buffering or lack of it
>> should be opaque.
>>   
>
> I think that's reasonable.

I don't understand.  It's not a device that needs bouncing, it's a 
particular transfer.  This could be either due to the transfer targeting 
mmio, or due to the transfer requiring a transformation.

>
>> Also the virtual-to-physical address resolution API could be generic,
>> ie all resolver functions should take same parameters so that the
>> devices would not need to know the next higher level device.
>>   
>
> Yes.  I think this is key.  The only observation I would make is that 
> the resolution API should have some sort of release function (so 
> map/unmap, lock/unlock, whatever).

Also, in order to support chaining, the input and output parameters need 
to be the same (both sglists).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16  9:41                             ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-16  9:41 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Blue Swirl, Gerd Hoffmann

Anthony Liguori wrote:
>> There are still some issues I'm not happy yet:
>> - handling of access violations: resolving should stop before the bad
>> page, the transfers should be done until that and then post error.
>> - bounce buffers needed for Lance byte swapping are not well designed 
>> (stack)
>>   
>
> I think you could approach the bouncing via a map/unmap API but I'm 
> not sure.  You would need a map() function to take a virtual address 
> which is sort of weird.  That would allow you to stack them in an 
> arbitrary fashion though.
>

I think that's broken.  iommus convert physical addresses to physical 
addresses (or bus addresses), possibly generating faults along the way, 
and depending on the iommu context.  map()/unmap() converts physical/bus 
addresses to virtual addresses, possibly bouncing.  Except for both 
doing conversions, they're very different.

>> This lead me to the thought that maybe we should not hide the bounce
>> buffer activity, but instead make it more explicit for the device that
>> needs bouncing. For the other device, the buffering or lack of it
>> should be opaque.
>>   
>
> I think that's reasonable.

I don't understand.  It's not a device that needs bouncing, it's a 
particular transfer.  This could be either due to the transfer targeting 
mmio, or due to the transfer requiring a transformation.

>
>> Also the virtual-to-physical address resolution API could be generic,
>> ie all resolver functions should take same parameters so that the
>> devices would not need to know the next higher level device.
>>   
>
> Yes.  I think this is key.  The only observation I would make is that 
> the resolution API should have some sort of release function (so 
> map/unmap, lock/unlock, whatever).

Also, in order to support chaining, the input and output parameters need 
to be the same (both sglists).

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-15 22:06                           ` Anthony Liguori
@ 2008-12-16 15:57                             ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 15:57 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: qemu-devel, Avi Kivity, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

On 12/16/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl wrote:
>
> > I changed the ESP SCSI and Lance Ethernet on Sparc32 to resolve the IO
> > address to physical memory (see patch). ESP works (no zero copy yet),
> > Lance doesn't. It looks much better. Because the resolving activity is
> > performed in serial steps, unbounded IO vector allocation does not
> > happen, but we still could launch as many IO as there are free IO
> > vectors.
> >
> >
>
>  It is a good cleanup.
>
>
> > There are still some issues I'm not happy yet:
> > - handling of access violations: resolving should stop before the bad
> > page, the transfers should be done until that and then post error.
> > - bounce buffers needed for Lance byte swapping are not well designed
> (stack)
> >
> >
>
>  I think you could approach the bouncing via a map/unmap API but I'm not
> sure.  You would need a map() function to take a virtual address which is
> sort of weird.  That would allow you to stack them in an arbitrary fashion
> though.

I'd still like to keep resolving virtual addresses to physical
addresses separate from physical address to host pointer mapping. I
think this was the enlightening discovery we made earlier.

The generic resolving API should look something like

int (*resolve)(target_phys_addr_t address_in, target_phys_addr_t
length_in, target_phys_addr_t &address_out, target_phys_addr_t
&length_out)

whereas the map/unmap API would be as you specified earlier.

> > This lead me to the thought that maybe we should not hide the bounce
> > buffer activity, but instead make it more explicit for the device that
> > needs bouncing. For the other device, the buffering or lack of it
> > should be opaque.
> >
> >
>
>  I think that's reasonable.
>
>
> > Also the virtual-to-physical address resolution API could be generic,
> > ie all resolver functions should take same parameters so that the
> > devices would not need to know the next higher level device.
> >
> >
>
>  Yes.  I think this is key.  The only observation I would make is that the
> resolution API should have some sort of release function (so map/unmap,
> lock/unlock, whatever).

I'm not so sure resolving works the same symmetric way, because the
virtual to physical mapping will change over time because of guest
activity. We could cache the resolved translations, but then there
should be a way to invalidate the cache entries throughout the device
chain with callbacks etc.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 15:57                             ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 15:57 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann, Avi Kivity

On 12/16/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl wrote:
>
> > I changed the ESP SCSI and Lance Ethernet on Sparc32 to resolve the IO
> > address to physical memory (see patch). ESP works (no zero copy yet),
> > Lance doesn't. It looks much better. Because the resolving activity is
> > performed in serial steps, unbounded IO vector allocation does not
> > happen, but we still could launch as many IO as there are free IO
> > vectors.
> >
> >
>
>  It is a good cleanup.
>
>
> > There are still some issues I'm not happy yet:
> > - handling of access violations: resolving should stop before the bad
> > page, the transfers should be done until that and then post error.
> > - bounce buffers needed for Lance byte swapping are not well designed
> (stack)
> >
> >
>
>  I think you could approach the bouncing via a map/unmap API but I'm not
> sure.  You would need a map() function to take a virtual address which is
> sort of weird.  That would allow you to stack them in an arbitrary fashion
> though.

I'd still like to keep resolving virtual addresses to physical
addresses separate from physical address to host pointer mapping. I
think this was the enlightening discovery we made earlier.

The generic resolving API should look something like

int (*resolve)(target_phys_addr_t address_in, target_phys_addr_t
length_in, target_phys_addr_t &address_out, target_phys_addr_t
&length_out)

whereas the map/unmap API would be as you specified earlier.

> > This lead me to the thought that maybe we should not hide the bounce
> > buffer activity, but instead make it more explicit for the device that
> > needs bouncing. For the other device, the buffering or lack of it
> > should be opaque.
> >
> >
>
>  I think that's reasonable.
>
>
> > Also the virtual-to-physical address resolution API could be generic,
> > ie all resolver functions should take same parameters so that the
> > devices would not need to know the next higher level device.
> >
> >
>
>  Yes.  I think this is key.  The only observation I would make is that the
> resolution API should have some sort of release function (so map/unmap,
> lock/unlock, whatever).

I'm not so sure resolving works the same symmetric way, because the
virtual to physical mapping will change over time because of guest
activity. We could cache the resolved translations, but then there
should be a way to invalidate the cache entries throughout the device
chain with callbacks etc.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-16 15:57                             ` Blue Swirl
@ 2008-12-16 16:29                               ` Paul Brook
  -1 siblings, 0 replies; 111+ messages in thread
From: Paul Brook @ 2008-12-16 16:29 UTC (permalink / raw)
  To: qemu-devel
  Cc: Blue Swirl, Anthony Liguori, Andrea Arcangeli, chrisw, kvm,
	Gerd Hoffmann, Avi Kivity

> The generic resolving API should look something like
>
> int (*resolve)(target_phys_addr_t address_in, target_phys_addr_t
> length_in, target_phys_addr_t &address_out, target_phys_addr_t
> &length_out)

I don't think this is sufficient. A paged iommu may split a single range into 
multiple disjoint sections. i.e. we need SG lists.

Paul

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 16:29                               ` Paul Brook
  0 siblings, 0 replies; 111+ messages in thread
From: Paul Brook @ 2008-12-16 16:29 UTC (permalink / raw)
  To: qemu-devel
  Cc: Andrea Arcangeli, chrisw, kvm, Blue Swirl, Gerd Hoffmann, Avi Kivity

> The generic resolving API should look something like
>
> int (*resolve)(target_phys_addr_t address_in, target_phys_addr_t
> length_in, target_phys_addr_t &address_out, target_phys_addr_t
> &length_out)

I don't think this is sufficient. A paged iommu may split a single range into 
multiple disjoint sections. i.e. we need SG lists.

Paul

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-16 16:29                               ` Paul Brook
@ 2008-12-16 16:35                                 ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 16:35 UTC (permalink / raw)
  To: Paul Brook
  Cc: qemu-devel, Anthony Liguori, Andrea Arcangeli, chrisw, kvm,
	Gerd Hoffmann, Avi Kivity

On 12/16/08, Paul Brook <paul@codesourcery.com> wrote:
> > The generic resolving API should look something like
>  >
>  > int (*resolve)(target_phys_addr_t address_in, target_phys_addr_t
>  > length_in, target_phys_addr_t &address_out, target_phys_addr_t
>  > &length_out)
>
>
> I don't think this is sufficient. A paged iommu may split a single range into
>  multiple disjoint sections. i.e. we need SG lists.

That's why there is the length_out, it is called repeatedly until the
length is zero or it fails.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 16:35                                 ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 16:35 UTC (permalink / raw)
  To: Paul Brook
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann, Avi Kivity

On 12/16/08, Paul Brook <paul@codesourcery.com> wrote:
> > The generic resolving API should look something like
>  >
>  > int (*resolve)(target_phys_addr_t address_in, target_phys_addr_t
>  > length_in, target_phys_addr_t &address_out, target_phys_addr_t
>  > &length_out)
>
>
> I don't think this is sufficient. A paged iommu may split a single range into
>  multiple disjoint sections. i.e. we need SG lists.

That's why there is the length_out, it is called repeatedly until the
length is zero or it fails.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-16  9:41                             ` Avi Kivity
@ 2008-12-16 16:55                               ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 16:55 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Anthony Liguori, qemu-devel, Andrea Arcangeli, chrisw, kvm,
	Gerd Hoffmann

On 12/16/08, Avi Kivity <avi@redhat.com> wrote:
> Anthony Liguori wrote:
>
> >
> > > There are still some issues I'm not happy yet:
> > > - handling of access violations: resolving should stop before the bad
> > > page, the transfers should be done until that and then post error.
> > > - bounce buffers needed for Lance byte swapping are not well designed
> (stack)
> > >
> > >
> >
> > I think you could approach the bouncing via a map/unmap API but I'm not
> sure.  You would need a map() function to take a virtual address which is
> sort of weird.  That would allow you to stack them in an arbitrary fashion
> though.
> >
> >
>
>  I think that's broken.  iommus converts physical addresses to physical
> addresses (or bus addresses), possibly generating faults along the way, and
> depending on the iommu context.  map()/unmap() converts physical/bus
> addresses to virtual addresses, possibly bouncing.  Except for both doing
> conversions, they're very different.
>
>
> >
> > > This lead me to the thought that maybe we should not hide the bounce
> > > buffer activity, but instead make it more explicit for the device that
> > > needs bouncing. For the other device, the buffering or lack of it
> > > should be opaque.
> > >
> > >
> >
> > I think that's reasonable.
> >
>
>  I don't understand.  It's not a device that needs bouncing, it's a
> particular transfer.  This could be either due to the transfer targeting
> mmio, or due to the transfer requiring a transformation.

Should the bouncing be something much more complex, for example
negotiated between the devices? Or maybe the devices set up a
transforming and non-transforming channel (which the other end should
be able to transform some more) and use them as needed?

In the Lance case, some of the DMA is byte swapped and some not,
depending on the memory region used, some is swapped because of a
global bit in a register and finally the Sparc32 DMA controller
reverses the swapping.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 16:55                               ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 16:55 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann

On 12/16/08, Avi Kivity <avi@redhat.com> wrote:
> Anthony Liguori wrote:
>
> >
> > > There are still some issues I'm not happy yet:
> > > - handling of access violations: resolving should stop before the bad
> > > page, the transfers should be done until that and then post error.
> > > - bounce buffers needed for Lance byte swapping are not well designed
> (stack)
> > >
> > >
> >
> > I think you could approach the bouncing via a map/unmap API but I'm not
> sure.  You would need a map() function to take a virtual address which is
> sort of weird.  That would allow you to stack them in an arbitrary fashion
> though.
> >
> >
>
>  I think that's broken.  iommus converts physical addresses to physical
> addresses (or bus addresses), possibly generating faults along the way, and
> depending on the iommu context.  map()/unmap() converts physical/bus
> addresses to virtual addresses, possibly bouncing.  Except for both doing
> conversions, they're very different.
>
>
> >
> > > This lead me to the thought that maybe we should not hide the bounce
> > > buffer activity, but instead make it more explicit for the device that
> > > needs bouncing. For the other device, the buffering or lack of it
> > > should be opaque.
> > >
> > >
> >
> > I think that's reasonable.
> >
>
>  I don't understand.  It's not a device that needs bouncing, it's a
> particular transfer.  This could be either due to the transfer targeting
> mmio, or due to the transfer requiring a transformation.

Should the bouncing be something much more complex, for example
negotiated between the devices? Or maybe the devices set up a
transforming and non-transforming channel (which the other end should
be able to transform some more) and use them as needed?

In the Lance case, some of the DMA is byte swapped and some not,
depending on the memory region used, some is swapped because of a
global bit in a register and finally the Sparc32 DMA controller
reverses the swapping.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-16 16:55                               ` Blue Swirl
@ 2008-12-16 17:09                                 ` Avi Kivity
  -1 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-16 17:09 UTC (permalink / raw)
  To: Blue Swirl
  Cc: Anthony Liguori, qemu-devel, Andrea Arcangeli, chrisw, kvm,
	Gerd Hoffmann

Blue Swirl wrote:
>>  I don't understand.  It's not a device that needs bouncing, it's a
>> particular transfer.  This could be either due to the transfer targeting
>> mmio, or due to the transfer requiring a transformation.
>>     
>
> Should the bouncing be something more much complex, for example
> negotiated between the devices? Or maybe the devices set up a
> transforming and non-transforming channel (which the other end should
> be able to transform some more) and use them as needed?
>   

Yes.  We already have two cases:

- may do partial request: useful for block storage where requests can be 
huge
- need full request: networking, where you can't send or receive half a 
packet; on the other hand, packets are small (even with tso)

You're adding a third case, always bounce, when the data undergoes a 
transformation which can't happen in-place.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 17:09                                 ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-16 17:09 UTC (permalink / raw)
  To: Blue Swirl; +Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann

Blue Swirl wrote:
>>  I don't understand.  It's not a device that needs bouncing, it's a
>> particular transfer.  This could be either due to the transfer targeting
>> mmio, or due to the transfer requiring a transformation.
>>     
>
> Should the bouncing be something more much complex, for example
> negotiated between the devices? Or maybe the devices set up a
> transforming and non-transforming channel (which the other end should
> be able to transform some more) and use them as needed?
>   

Yes.  We already have two cases:

- may do partial request: useful for block storage where requests can be 
huge
- need full request: networking, where you can't send or receive half a 
packet; on the other hand, packets are small (even with tso)

You're adding a third case, always bounce, when the data undergoes a 
transformation which can't happen in-place.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-16 17:09                                 ` Avi Kivity
@ 2008-12-16 17:48                                   ` Anthony Liguori
  -1 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-16 17:48 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Blue Swirl, qemu-devel, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

Avi Kivity wrote:
> Blue Swirl wrote:
>>>  I don't understand.  It's not a device that needs bouncing, it's a
>>> particular transfer.  This could be either due to the transfer 
>>> targeting
>>> mmio, or due to the transfer requiring a transformation.
>>>     
>>
>> Should the bouncing be something more much complex, for example
>> negotiated between the devices? Or maybe the devices set up a
>> transforming and non-transforming channel (which the other end should
>> be able to transform some more) and use them as needed?
>>   
>
> Yes.  We already have two cases:
>
> - may do partial request: useful for block storage where requests can 
> be huge
> - need full request: networking, where you can't send or receive half 
> a packet; on the other hand, packets are small (even with tso)
>
> You're adding a third case, always bounce, when the data undergoes a 
> transformation which can't happen in-place.

IMHO, IOMMU translation is distinct from mapping/data transformation.  I 
would think the IOMMU translation API would be different in that it took 
a physical address and returned a physical address.

The IOMMU DMA API (which could transform data potentially) should return 
a virtual address and take a physical address.  In the normal case 
(non-transforming IOMMU), it should just fall-through to CPU DMA API 
(which is just map/unmap).

The PCI DMA API would normally fall through to the IOMMU DMA API.

So we would have:

PCI DMA map(addr):
  if IOMMU:
     return IOMMU DMA map(addr)
  return CPU DMA map(addr)

IOMMU DMA map(addr):
  new_address = IOMMU translate(addr)
  if transform:
    return IOMMU byte-swap map(addr)
  return CPU DMA map(addr)

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 17:48                                   ` Anthony Liguori
  0 siblings, 0 replies; 111+ messages in thread
From: Anthony Liguori @ 2008-12-16 17:48 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Blue Swirl, Gerd Hoffmann

Avi Kivity wrote:
> Blue Swirl wrote:
>>>  I don't understand.  It's not a device that needs bouncing, it's a
>>> particular transfer.  This could be either due to the transfer 
>>> targeting
>>> mmio, or due to the transfer requiring a transformation.
>>>     
>>
>> Should the bouncing be something more much complex, for example
>> negotiated between the devices? Or maybe the devices set up a
>> transforming and non-transforming channel (which the other end should
>> be able to transform some more) and use them as needed?
>>   
>
> Yes.  We already have two cases:
>
> - may do partial request: useful for block storage where requests can 
> be huge
> - need full request: networking, where you can't send or receive half 
> a packet; on the other hand, packets are small (even with tso)
>
> You're adding a third case, always bounce, when the data undergoes a 
> transformation which can't happen in-place.

IMHO, IOMMU translation is distinct from mapping/data transformation.  I 
would think the IOMMU translation API would be different in that it took 
a physical address and returned a physical address.

The IOMMU DMA API (which could transform data potentially) should return 
a virtual address and take a physical address.  In the normal case 
(non-transforming IOMMU), it should just fall-through to CPU DMA API 
(which is just map/unmap).

The PCI DMA API would normally fall through to the IOMMU DMA API.

So we would have:

PCI DMA map(addr):
  if IOMMU:
     return IOMMU DMA map(addr)
  return CPU DMA map(addr)

IOMMU DMA map(addr):
  new_address = IOMMU translate(addr)
  if transform:
    return IOMMU byte-swap map(addr)
  return CPU DMA map(addr)

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-16 17:48                                   ` Anthony Liguori
@ 2008-12-16 18:11                                     ` Blue Swirl
  -1 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 18:11 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Avi Kivity, qemu-devel, Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

On 12/16/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Avi Kivity wrote:
>
> > Blue Swirl wrote:
> >
> > >
> > > >  I don't understand.  It's not a device that needs bouncing, it's a
> > > > particular transfer.  This could be either due to the transfer
> targeting
> > > > mmio, or due to the transfer requiring a transformation.
> > > >
> > > >
> > >
> > > Should the bouncing be something more much complex, for example
> > > negotiated between the devices? Or maybe the devices set up a
> > > transforming and non-transforming channel (which the other end should
> > > be able to transform some more) and use them as needed?
> > >
> > >
> >
> > Yes.  We already have two cases:
> >
> > - may do partial request: useful for block storage where requests can be
> huge
> > - need full request: networking, where you can't send or receive half a
> packet; on the other hand, packets are small (even with tso)
> >
> > You're adding a third case, always bounce, when the data undergoes a
> transformation which can't happen in-place.
> >
>
>  IMHO, IOMMU translation is distinct from mapping/data transformation.  I
> would think the IOMMU translation API would be different in that it took a
> physical address and returned a physical address.

Fully agree. On Sparc32, the incoming address is called DVMA (device
virtual memory address).

>  The IOMMU DMA API (which could transform data potentially) should return a
> virtual address and data a physical address.  In the normal case
> (non-transforming IOMMU), it should just fall-through to CPU DMA API (which
> is just map/unmap).
>
>  The PCI DMA API would normally fall through to the IOMMU DMA API.
>
>  So we would have:
>
>  PCI DMA map(addr):
>   if IOMMU:
>     return IOMMU DMA map(addr)
>   return CPU DMA map(addr)
>
>  IOMMU DMA map(addr):
>   new_address = IOMMU translate(addr)
>   if transform:
>    return IOMMU byte-swap map(addr)
>   return CPU DMA map(addr)

But with this we would be back to the original merged API.

I'd propose three to four distinct steps:
1. Resolve the device addresses via IOMMU etc. to guest physical
addresses (typically no changes if no IOMMU)
2. Negotiate bouncing (if bounced, the step returns a host address, if
not, a physical address)
3. Map the physical addresses to host addresses, bypass for the bounce cases
4. Perform vectorized zero-copy AIO. (Profit?)

2 & 3 may/should be merged but I'm not sure we have the full picture yet.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-16 18:11                                     ` Blue Swirl
  0 siblings, 0 replies; 111+ messages in thread
From: Blue Swirl @ 2008-12-16 18:11 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andrea Arcangeli, chrisw, kvm, qemu-devel, Gerd Hoffmann, Avi Kivity

On 12/16/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Avi Kivity wrote:
>
> > Blue Swirl wrote:
> >
> > >
> > > >  I don't understand.  It's not a device that needs bouncing, it's a
> > > > particular transfer.  This could be either due to the transfer
> targeting
> > > > mmio, or due to the transfer requiring a transformation.
> > > >
> > > >
> > >
> > > Should the bouncing be something more much complex, for example
> > > negotiated between the devices? Or maybe the devices set up a
> > > transforming and non-transforming channel (which the other end should
> > > be able to transform some more) and use them as needed?
> > >
> > >
> >
> > Yes.  We already have two cases:
> >
> > - may do partial request: useful for block storage where requests can be
> huge
> > - need full request: networking, where you can't send or receive half a
> packet; on the other hand, packets are small (even with tso)
> >
> > You're adding a third case, always bounce, when the data undergoes a
> transformation which can't happen in-place.
> >
>
>  IMHO, IOMMU translation is distinct from mapping/data transformation.  I
> would think the IOMMU translation API would be different in that it took a
> physical address and returned a physical address.

Fully agree. On Sparc32, the incoming address is called DVMA (device
virtual memory address).

>  The IOMMU DMA API (which could transform data potentially) should return a
> virtual address and data a physical address.  In the normal case
> (non-transforming IOMMU), it should just fall-through to CPU DMA API (which
> is just map/unmap).
>
>  The PCI DMA API would normally fall through to the IOMMU DMA API.
>
>  So we would have:
>
>  PCI DMA map(addr):
>   if IOMMU:
>     return IOMMU DMA map(addr)
>   return CPU DMA map(addr)
>
>  IOMMU DMA map(addr):
>   new_address = IOMMU translate(addr)
>   if transform:
>    return IOMMU byte-swap map(addr)
>   return CPU DMA map(addr)

But with this we would be back to the original merged API.

I'd propose three to four distinct steps:
1. Resolve the device addresses via IOMMU etc. to guest physical
addresses (typically no changes if no IOMMU)
2. Negotiate bouncing (if bounced, the step returns a host address, if
not, a physical address)
3. Map the physical addresses to host addresses, bypass for the bounce cases
4. Perform vectorized zero-copy AIO. (Profit?)

2 & 3 may/should be merged but I'm not sure we have the full picture yet.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-14 19:59                     ` [Qemu-devel] " Avi Kivity
@ 2008-12-22 16:44                       ` Ian Jackson
  -1 siblings, 0 replies; 111+ messages in thread
From: Ian Jackson @ 2008-12-22 16:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, kvm, Gerd Hoffmann

Avi Kivity writes ("[Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO"):
> Newer Xen shouldn't have this problem though; it runs qemu in kernel 
> mode in a dedicated 64-bit domain.

This is not yet the default configuration and there are still various
reasons why you might not want to use it.  I expect we'll retain the
dom0 user process option for a while yet.

I think there is continued value in being able to emulate a guest
whose physical address space exceeds the virtual address space in the
host.  The whole assumption that all guest accessible RAM is mapped at
once, contiguously, is I think wrong.

Ian.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-22 16:44                       ` Ian Jackson
  0 siblings, 0 replies; 111+ messages in thread
From: Ian Jackson @ 2008-12-22 16:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Avi Kivity writes ("[Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO"):
> Newer Xen shouldn't have this problem though; it runs qemu in kernel 
> mode in a dedicated 64-bit domain.

This is not yet the default configuration and there are still various
reasons why you might not want to use it.  I expect we'll retain the
dom0 user process option for a while yet.

I think there is continued value in being able to emulate a guest
whose physical address space exceeds the virtual address space in the
host.  The whole assumption that all guest accessible RAM is mapped at
once, contiguously, is I think wrong.

Ian.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-22 16:44                       ` Ian Jackson
  (?)
@ 2008-12-22 19:44                       ` Avi Kivity
  2008-12-23  0:03                         ` Thiemo Seufer
  -1 siblings, 1 reply; 111+ messages in thread
From: Avi Kivity @ 2008-12-22 19:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Ian Jackson wrote:
> Avi Kivity writes ("[Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO"):
>   
>> Newer Xen shouldn't have this problem though; it runs qemu in kernel 
>> mode in a dedicated 64-bit domain.
>>     
>
> I think there is continued value in being able to emulate a guest
> whose physical address space exceeds the virtual address space in the
> host.  The whole assumption that all guest accessible RAM is mapped at
> once, contiguously, is I think wrong.
>   

I agree that we shouldn't expect memory to be contiguous, in order to 
properly support hotplug.  But I see zero value in trying to support 
large memory configurations on 32-bit in 2008.  This is what 64-bit 
systems are for!  If Xen has a problem with 64-bit hosts, we can try to 
accommodate it, but to have 32-bit qemu/tcg or qemu/kvm support large 
address spaces is pointless IMO.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-22 16:44                       ` Ian Jackson
  (?)
  (?)
@ 2008-12-22 19:46                       ` Avi Kivity
  2009-01-05 10:27                           ` Gerd Hoffmann
  -1 siblings, 1 reply; 111+ messages in thread
From: Avi Kivity @ 2008-12-22 19:46 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Ian Jackson wrote:
> Avi Kivity writes ("[Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO"):
>   
>> Newer Xen shouldn't have this problem though; it runs qemu in kernel 
>> mode in a dedicated 64-bit domain.
>>     
>
> I think there is continued value in being able to emulate a guest
> whose physical address space exceeds the virtual address space in the
> host.  The whole assumption that all guest accessible RAM is mapped at
> once, contiguously, is I think wrong.
>   

I agree that we shouldn't expect memory to be contiguous, in order to 
properly support hotplug.  But I see zero value in trying to support 
large memory configurations on 32-bit in 2008.  This is what 64-bit 
systems are for!  If Xen has a problem with 64-bit hosts, we can try to 
accommodate it, but to have 32-bit qemu/tcg or qemu/kvm support large 
address spaces is pointless IMO.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-22 19:44                       ` Avi Kivity
@ 2008-12-23  0:03                         ` Thiemo Seufer
  2008-12-23  1:02                             ` Andrea Arcangeli
  2008-12-23 17:31                             ` Avi Kivity
  0 siblings, 2 replies; 111+ messages in thread
From: Thiemo Seufer @ 2008-12-23  0:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Avi Kivity wrote:
> Ian Jackson wrote:
>> Avi Kivity writes ("[Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO"):
>>   
>>> Newer Xen shouldn't have this problem though; it runs qemu in kernel  
>>> mode in a dedicated 64-bit domain.
>>>     
>>
>> I think there is continued value in being able to emulate a guest
>> whose physical address space exceeds the virtual address space in the
>> host.  The whole assumption that all guest accessible RAM is mapped at
>> once, contiguously, is I think wrong.
>>   
>
> I agree that we shouldn't expect memory to be contiguous, in order to  
> properly support hotplug.  But I see zero value in trying to support  
> large memory configurations on 32-bit in 2008.  This is what 64-bit  
> systems are for!  If Xen has a problem with 64-bit hosts, we can try to  
> accommodate it, but to have 32-bit qemu/tcg

Running x86-64 binaries on a (non-x86) 32-bit host is IMHO quite an
obvious application for qemu/tcg.

> or qemu/kvm support large address spaces is pointless IMO.

Interestingly, Virtualbox just started to support
64-bit-target-on-32-bit-host.


Thiemo

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-23  0:03                         ` Thiemo Seufer
@ 2008-12-23  1:02                             ` Andrea Arcangeli
  2008-12-23 17:31                             ` Avi Kivity
  1 sibling, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-23  1:02 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: qemu-devel, chrisw, Gerd Hoffmann, kvm

On Tue, Dec 23, 2008 at 01:03:36AM +0100, Thiemo Seufer wrote:
> Running x86-64 binaries on a (non-x86) 32-bit host is IMHO quite an
> obvious application for qemu/tcg.

I doubt you need device drivers when only translating bytecode. DMA
API discussed here is an API for drivers only AFAIK.

> Interestingly, Virtualbox just started to support
> 64-bit-target-on-32-bit-host.

Yes, and KVM supports this already AFAIK (without PR and without any
dynamic map/unmap operation that doesn't exist today).

The issue here is not to run 64bit target on 32bit host, this is about
running a 64bit target with >3G (or >4G of ram perhaps on sparc32 that
has separate kernel/user address space) of ram on a 32bit host.

What we're discussing here is a new feature. It's hard to imagine
any non-x86 32bit system wanting to run a >3G ram x86-64 (or any
other 64bit) VM. It wouldn't work today, and I'm not particularly
interested to make it work in 2009 and later given it's 99.9% more
likely that you have a 64bit capable CPU than >4G of ram on your
system. The limit isn't the size of the guest physical address space,
the limit is the amount of RAM that you want to give to the 64bit
guest.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-23  1:02                             ` Andrea Arcangeli
  0 siblings, 0 replies; 111+ messages in thread
From: Andrea Arcangeli @ 2008-12-23  1:02 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: chrisw, qemu-devel, kvm, Gerd Hoffmann

On Tue, Dec 23, 2008 at 01:03:36AM +0100, Thiemo Seufer wrote:
> Running x86-64 binaries on a (non-x86) 32-bit host is IMHO quite an
> obvious application for qemu/tcg.

I doubt you need device drivers when only translating bytecode. DMA
API discussed here is an API for drivers only AFAIK.

> Interestingly, Virtualbox just started to support
> 64-bit-target-on-32-bit-host.

Yes, and KVM supports this already AFAIK (without PR and without any
dynamic map/unmap operation that doesn't exist today).

The issue here is not to run 64bit target on 32bit host, this is about
running a 64bit target with >3G (or >4G of ram perhaps on sparc32 that
has separate kernel/user address space) of ram on a 32bit host.

What we're discussing here is a new feature. It's hard to imagine
any non-x86 32bit system wanting to run a >3G ram x86-64 (or any
other 64bit) VM. It wouldn't work today, and I'm not particularly
interested to make it work in 2009 and later given it's 99.9% more
likely that you have a 64bit capable CPU than >4G of ram on your
system. The limit isn't the size of the guest physical address space,
the limit is the amount of RAM that you want to give to the 64bit
guest.

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-23  0:03                         ` Thiemo Seufer
@ 2008-12-23 17:31                             ` Avi Kivity
  2008-12-23 17:31                             ` Avi Kivity
  1 sibling, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-23 17:31 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: qemu-devel, Andrea Arcangeli, chrisw, Gerd Hoffmann, kvm

Thiemo Seufer wrote:

  

>> I agree that we shouldn't expect memory to be contiguous, in order to  
>> properly support hotplug.  But I see zero value in trying to support  
>> large memory configurations on 32-bit in 2008.  This is what 64-bit  
>> systems are for!  If Xen has a problem with 64-bit hosts, we can try to  
>> accommodate it, but to have 32-bit qemu/tcg
>>     
>
> Running x86-64 binaries on a (non-x86) 32-bit host is IMHO quite an
> obvious application for qemu/tcg.
>   

I agree.  I don't think running guests with >2GB of RAM on 32-bit hosts 
is a good idea, though.  Instruction set, physical address space width, 
and actual maximum memory size supported are very different things.

I'm sure you don't want qemu to swap memory in and out of its address 
space (which is what Xen does).

>> or qemu/kvm support large address spaces is pointless IMO.
>>     
>
> Interestingly, Virtualbox just started to support
> 64-bit-target-on-32-bit-host.
>   

That makes sense on Windows, where 32-bit hosts still have an advantage 
over 64-bit.  But again, that's very different from supporting a large 
address space.

Want 64-bit guests? sure, qemu/tcg can handle it, as well as qemu/kvm 
(on a 64-bit host).  Want lots of memory for your CAD 
application/openoffice docs/qemu guests?  Use a 64-bit host.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2008-12-23 17:31                             ` Avi Kivity
  0 siblings, 0 replies; 111+ messages in thread
From: Avi Kivity @ 2008-12-23 17:31 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm, Gerd Hoffmann

Thiemo Seufer wrote:

  

>> I agree that we shouldn't expect memory to be contiguous, in order to  
>> properly support hotplug.  But I see zero value in trying to support  
>> large memory configurations on 32-bit in 2008.  This is what 64-bit  
>> systems are for!  If Xen has a problem with 64-bit hosts, we can try to  
>> accommodate it, but to have 32-bit qemu/tcg
>>     
>
> Running x86-64 binaries on a (non-x86) 32-bit host is IMHO quite an
> obvious application for qemu/tcg.
>   

I agree.  I don't think running guests with >2GB of RAM on 32-bit hosts 
is a good idea, though.  Instruction set, physical address space width, 
and actual maximum memory size supported are very different things.

I'm sure you don't want qemu to swap memory in and out of its address 
space (which is what Xen does).

>> or qemu/kvm support large address spaces is pointless IMO.
>>     
>
> Interestingly, Virtualbox just started to support
> 64-bit-target-on-32-bit-host.
>   

That makes sense on Windows, where 32-bit hosts still have an advantage 
over 64-bit.  But again, that's very different from supporting a large 
address space.

Want 64-bit guests? sure, qemu/tcg can handle it, as well as qemu/kvm 
(on a 64-bit host).  Want lots of memory for your CAD 
application/openoffice docs/qemu guests?  Use a 64-bit host.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
  2008-12-22 19:46                       ` Avi Kivity
@ 2009-01-05 10:27                           ` Gerd Hoffmann
  0 siblings, 0 replies; 111+ messages in thread
From: Gerd Hoffmann @ 2009-01-05 10:27 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel, Andrea Arcangeli, chrisw, kvm

Avi Kivity wrote:
> Ian Jackson wrote:
>> I think there is continued value in being able to emulate a guest
>> whose physical address space exceeds the virtual address space in the
>> host.  The whole assumption that all guest accessible RAM is mapped at
>> once, contiguously, is I think wrong.
> 
> If Xen has a problem with 64-bit hosts, we can try to
> accommodate it,

Yes, this is the whole point.  Flesh out the DMA api in a way that Xen
can use it.  Large memory guests are only one reason why Xen needs
map/unmap.  The other one is grant table support.

> but to have 32-bit qemu/tcg or qemu/kvm support large
> address spaces is pointless IMO.

You don't have to support that.

I think the DMA api should look like this in the end:

struct qemu_dma_ops {
     map(...);
     xfer(...);
     unmap(...);
};

The qemu/xen implementation would actually map/unmap.

For qemu/tcg and qemu/kvm, which have all guest memory permanently mapped,
map() would be a guest-physical -> host-virtual address translation and
unmap() would be a nop (or refcount--).

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 111+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2 of 5] add can_dma/post_dma for direct IO
@ 2009-01-05 10:27                           ` Gerd Hoffmann
  0 siblings, 0 replies; 111+ messages in thread
From: Gerd Hoffmann @ 2009-01-05 10:27 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, chrisw, qemu-devel, kvm

Avi Kivity wrote:
> Ian Jackson wrote:
>> I think there is continued value in being able to emulate a guest
>> whose physical address space exceeds the virtual address space in the
>> host.  The whole assumption that all guest accessible RAM is mapped at
>> once, contiguously, is I think wrong.
> 
> If Xen has a problem with 64-bit hosts, we can try to
> accommodate it,

Yes, this is the whole point.  Flesh out the DMA api in a way that Xen
can use it.  Large memory guests are only one reason why Xen needs
map/unmap.  The other one is grant table support.

> but to have 32-bit qemu/tcg or qemu/kvm support large
> address spaces is pointless IMO.

You don't have to support that.

I think the DMA api should look like this in the end:

struct qemu_dma_ops {
     map(...);
     xfer(...);
     unmap(...);
};

The qemu/xen implementation would actually map/unmap.

For qemu/tcg and qemu/kvm, which have all guest memory permanently mapped,
map() would be a guest-physical -> host-virtual address translation and
unmap() would be a nop (or refcount--).

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 111+ messages in thread

end of thread, other threads:[~2009-01-05 10:28 UTC | newest]

Thread overview: 111+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-12-12 18:16 [PATCH 0 of 5] dma api v3 Andrea Arcangeli
2008-12-12 18:16 ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 18:16 ` [PATCH 1 of 5] fix cpu_physical_memory len Andrea Arcangeli
2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 19:06   ` Anthony Liguori
2008-12-12 19:06     ` [Qemu-devel] " Anthony Liguori
2008-12-12 19:26     ` Andrea Arcangeli
2008-12-12 19:26       ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 18:16 ` [PATCH 2 of 5] add can_dma/post_dma for direct IO Andrea Arcangeli
2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 19:00   ` Blue Swirl
2008-12-12 19:00     ` Blue Swirl
2008-12-12 19:18     ` Anthony Liguori
2008-12-12 19:18       ` Anthony Liguori
2008-12-12 20:05       ` Blue Swirl
2008-12-12 20:10         ` Anthony Liguori
2008-12-12 20:10           ` Anthony Liguori
2008-12-12 19:15   ` Anthony Liguori
2008-12-12 19:15     ` [Qemu-devel] " Anthony Liguori
2008-12-12 19:37     ` Andrea Arcangeli
2008-12-12 19:37       ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 20:09       ` Anthony Liguori
2008-12-12 20:09         ` [Qemu-devel] " Anthony Liguori
2008-12-12 20:25       ` Gerd Hoffmann
2008-12-12 20:25         ` [Qemu-devel] " Gerd Hoffmann
2008-12-12 19:39     ` Anthony Liguori
2008-12-12 19:39       ` [Qemu-devel] " Anthony Liguori
2008-12-13  9:22       ` Avi Kivity
2008-12-13  9:22         ` Avi Kivity
2008-12-13 16:45         ` Anthony Liguori
2008-12-13 16:45           ` Anthony Liguori
2008-12-13 19:48           ` Avi Kivity
2008-12-13 19:48             ` Avi Kivity
2008-12-13 21:07             ` Anthony Liguori
2008-12-13 21:07               ` Anthony Liguori
2008-12-14  6:03               ` Avi Kivity
2008-12-14  6:03                 ` Avi Kivity
2008-12-14 19:10                 ` Anthony Liguori
2008-12-14 19:10                   ` Anthony Liguori
2008-12-14 19:49                   ` Avi Kivity
2008-12-14 19:49                     ` Avi Kivity
2008-12-14 23:08                     ` Anthony Liguori
2008-12-14 23:08                       ` Anthony Liguori
2008-12-15  0:57                       ` Paul Brook
2008-12-15  0:57                         ` Paul Brook
2008-12-15  2:09                         ` Anthony Liguori
2008-12-15  2:09                           ` Anthony Liguori
2008-12-15  6:23                           ` Avi Kivity
2008-12-15  6:23                             ` Avi Kivity
2008-12-15 18:35                       ` Blue Swirl
2008-12-15 18:35                         ` Blue Swirl
2008-12-15 22:06                         ` Anthony Liguori
2008-12-15 22:06                           ` Anthony Liguori
2008-12-16  9:41                           ` Avi Kivity
2008-12-16  9:41                             ` Avi Kivity
2008-12-16 16:55                             ` Blue Swirl
2008-12-16 16:55                               ` Blue Swirl
2008-12-16 17:09                               ` Avi Kivity
2008-12-16 17:09                                 ` Avi Kivity
2008-12-16 17:48                                 ` Anthony Liguori
2008-12-16 17:48                                   ` Anthony Liguori
2008-12-16 18:11                                   ` Blue Swirl
2008-12-16 18:11                                     ` Blue Swirl
2008-12-16 15:57                           ` Blue Swirl
2008-12-16 15:57                             ` Blue Swirl
2008-12-16 16:29                             ` Paul Brook
2008-12-16 16:29                               ` Paul Brook
2008-12-16 16:35                               ` Blue Swirl
2008-12-16 16:35                                 ` Blue Swirl
2008-12-14 17:30       ` Jamie Lokier
2008-12-14 17:30         ` Jamie Lokier
2008-12-13 14:39     ` Andrea Arcangeli
2008-12-13 14:39       ` [Qemu-devel] " Andrea Arcangeli
2008-12-13 16:46       ` Anthony Liguori
2008-12-13 16:46         ` [Qemu-devel] " Anthony Liguori
2008-12-13 16:53         ` Andrea Arcangeli
2008-12-13 16:53           ` [Qemu-devel] " Andrea Arcangeli
2008-12-13 17:54           ` Andreas Färber
2008-12-13 21:11           ` Anthony Liguori
2008-12-13 21:11             ` [Qemu-devel] " Anthony Liguori
2008-12-14 16:47             ` Andrea Arcangeli
2008-12-14 16:47               ` [Qemu-devel] " Andrea Arcangeli
2008-12-14 17:01               ` Avi Kivity
2008-12-14 17:01                 ` [Qemu-devel] " Avi Kivity
2008-12-14 17:15                 ` Andrea Arcangeli
2008-12-14 17:15                   ` [Qemu-devel] " Andrea Arcangeli
2008-12-14 19:59                   ` Avi Kivity
2008-12-14 19:59                     ` [Qemu-devel] " Avi Kivity
2008-12-22 16:44                     ` Ian Jackson
2008-12-22 16:44                       ` Ian Jackson
2008-12-22 19:44                       ` Avi Kivity
2008-12-23  0:03                         ` Thiemo Seufer
2008-12-23  1:02                           ` Andrea Arcangeli
2008-12-23  1:02                             ` Andrea Arcangeli
2008-12-23 17:31                           ` Avi Kivity
2008-12-23 17:31                             ` Avi Kivity
2008-12-22 19:46                       ` Avi Kivity
2009-01-05 10:27                         ` Gerd Hoffmann
2009-01-05 10:27                           ` Gerd Hoffmann
2008-12-13 22:47     ` Anthony Liguori
2008-12-13 22:47       ` [Qemu-devel] " Anthony Liguori
2008-12-14  6:07       ` Avi Kivity
2008-12-14  6:07         ` [Qemu-devel] " Avi Kivity
2008-12-12 18:16 ` [PATCH 3 of 5] rename dma.c to isa_dma.c Andrea Arcangeli
2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 18:16 ` [PATCH 4 of 5] dma api Andrea Arcangeli
2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli
2008-12-12 18:55   ` Blue Swirl
2008-12-12 18:55     ` Blue Swirl
2008-12-12 18:16 ` [PATCH 5 of 5] bdrv_aio_readv/writev Andrea Arcangeli
2008-12-12 18:16   ` [Qemu-devel] " Andrea Arcangeli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.