* [RFC] Replace posix-aio with custom thread pool
@ 2008-12-05 21:21 ` Anthony Liguori
From: Anthony Liguori @ 2008-12-05 21:21 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Anthony Liguori

glibc implements posix-aio as a thread pool and imposes a number of limitations.

1) It limits us to one request per file descriptor.  We hack around this by
dup()'ing file descriptors, which is hideously ugly.

2) It's impossible to add new interfaces, and we need a vectored read/write
operation to properly support a zero-copy API.
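
As a sketch of what such a vectored interface could look like (the names and
fields below are illustrative assumptions, not part of this patch):

    /* Hypothetical vectored request: one submission covers a whole
     * scatter/gather list instead of a single buffer. */
    struct qemu_paiocb_v {
        int           aio_fildes;   /* file descriptor to operate on */
        struct iovec *aio_iov;      /* scatter/gather list */
        int           aio_niov;     /* number of iovec elements */
        off_t         aio_offset;   /* starting file offset */
    };

    int qemu_paio_readv(struct qemu_paiocb_v *cb);
    int qemu_paio_writev(struct qemu_paiocb_v *cb);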

The glibc folks have suggested that we implement whatever new interfaces we
want; they can eventually be proposed for standardization.  This requires
that we provide our own posix-aio implementation, though.

This patch implements posix-aio using pthreads.  It immediately eliminates the
need for fd pooling.

It performs at least as well as the current posix-aio code (in some
circumstances, even better).

My only concern here is non-Linux Unices like FreeBSD, which have kernel
support for posix-aio.  Since we cannot extend those interfaces, I think that
even on those platforms we should still use a thread pool.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile b/Makefile
index 76470a4..030a6ae 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,9 @@ BLOCK_OBJS+=nbd.o block.o aio.o
 ifdef CONFIG_WIN32
 BLOCK_OBJS += block-raw-win32.o
 else
+ifdef CONFIG_AIO
+BLOCK_OBJS += posix-aio-compat.o
+endif
 BLOCK_OBJS += block-raw-posix.o
 endif
 
diff --git a/Makefile.target b/Makefile.target
index 671d72a..32dfb85 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -595,6 +595,9 @@ endif
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
+ifdef CONFIG_AIO
+OBJS+=posix-aio-compat.o
+endif
 OBJS+=block-raw-posix.o
 endif
 
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 0a06a12..74b875a 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -27,7 +27,7 @@
 #include "block_int.h"
 #include <assert.h>
 #ifdef CONFIG_AIO
-#include <aio.h>
+#include "posix-aio-compat.h"
 #endif
 
 #ifdef CONFIG_COCOA
@@ -93,16 +93,10 @@
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
-/* posix-aio doesn't allow multiple outstanding requests to a single file
- * descriptor.  we implement a pool of dup()'d file descriptors to work
- * around this */
-#define RAW_FD_POOL_SIZE	64
-
 typedef struct BDRVRawState {
     int fd;
     int type;
     unsigned int lseek_err_cnt;
-    int fd_pool[RAW_FD_POOL_SIZE];
 #if defined(__linux__)
     /* linux floppy specific */
     int fd_open_flags;
@@ -122,7 +116,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
-    int i;
 
     posix_aio_init();
 
@@ -155,8 +148,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
         return ret;
     }
     s->fd = fd;
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++)
-        s->fd_pool[i] = -1;
     s->aligned_buf = NULL;
     if ((flags & BDRV_O_NOCACHE)) {
         s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE);
@@ -446,7 +437,6 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
 
 typedef struct RawAIOCB {
     BlockDriverAIOCB common;
-    int fd;
     struct aiocb aiocb;
     struct RawAIOCB *next;
     int ret;
@@ -458,38 +448,6 @@ typedef struct PosixAioState
     RawAIOCB *first_aio;
 } PosixAioState;
 
-static int raw_fd_pool_get(BDRVRawState *s)
-{
-    int i;
-
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
-        /* already in use */
-        if (s->fd_pool[i] != -1)
-            continue;
-
-        /* try to dup file descriptor */
-        s->fd_pool[i] = dup(s->fd);
-        if (s->fd_pool[i] != -1)
-            return s->fd_pool[i];
-    }
-
-    /* we couldn't dup the file descriptor so just use the main one */
-    return s->fd;
-}
-
-static void raw_fd_pool_put(RawAIOCB *acb)
-{
-    BDRVRawState *s = acb->common.bs->opaque;
-    int i;
-
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
-        if (s->fd_pool[i] == acb->fd) {
-            close(s->fd_pool[i]);
-            s->fd_pool[i] = -1;
-        }
-    }
-}
-
 static void posix_aio_read(void *opaque)
 {
     PosixAioState *s = opaque;
@@ -519,7 +477,6 @@ static void posix_aio_read(void *opaque)
             if (ret == ECANCELED) {
                 /* remove the request */
                 *pacb = acb->next;
-                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
             } else if (ret != EINPROGRESS) {
                 /* end of aio */
@@ -536,7 +493,6 @@ static void posix_aio_read(void *opaque)
                 *pacb = acb->next;
                 /* call the callback */
                 acb->common.cb(acb->common.opaque, ret);
-                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
                 break;
             } else {
@@ -571,6 +527,7 @@ static int posix_aio_init(void)
     struct sigaction act;
     PosixAioState *s;
     int fds[2];
+    struct aioinit ai;
   
     if (posix_aio_state)
         return 0;
@@ -598,24 +555,11 @@ static int posix_aio_init(void)
 
     qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
 
-#if defined(__linux__)
-    {
-        struct aioinit ai;
+    memset(&ai, 0, sizeof(ai));
+    ai.aio_threads = 64;
+    ai.aio_num = 64;
+    aio_init(&ai);
 
-        memset(&ai, 0, sizeof(ai));
-#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 4)
-        ai.aio_threads = 64;
-        ai.aio_num = 64;
-#else
-        /* XXX: aio thread exit seems to hang on RedHat 9 and this init
-           seems to fix the problem. */
-        ai.aio_threads = 1;
-        ai.aio_num = 1;
-        ai.aio_idle_time = 365 * 100000;
-#endif
-        aio_init(&ai);
-    }
-#endif
     posix_aio_state = s;
 
     return 0;
@@ -634,8 +578,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
     acb = qemu_aio_get(bs, cb, opaque);
     if (!acb)
         return NULL;
-    acb->fd = raw_fd_pool_get(s);
-    acb->aiocb.aio_fildes = acb->fd;
+    acb->aiocb.aio_fildes = s->fd;
     acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
     acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
     acb->aiocb.aio_buf = buf;
@@ -738,14 +681,12 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
             break;
         } else if (*pacb == acb) {
             *pacb = acb->next;
-            raw_fd_pool_put(acb);
             qemu_aio_release(acb);
             break;
         }
         pacb = &acb->next;
     }
 }
-
 #else /* CONFIG_AIO */
 static int posix_aio_init(void)
 {
@@ -753,17 +694,6 @@ static int posix_aio_init(void)
 }
 #endif /* CONFIG_AIO */
 
-static void raw_close_fd_pool(BDRVRawState *s)
-{
-    int i;
-
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
-        if (s->fd_pool[i] != -1) {
-            close(s->fd_pool[i]);
-            s->fd_pool[i] = -1;
-        }
-    }
-}
 
 static void raw_close(BlockDriverState *bs)
 {
@@ -774,7 +704,6 @@ static void raw_close(BlockDriverState *bs)
         if (s->aligned_buf != NULL)
             qemu_free(s->aligned_buf);
     }
-    raw_close_fd_pool(s);
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -895,6 +824,7 @@ BlockDriver bdrv_raw = {
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
+
     .bdrv_pread = raw_pread,
     .bdrv_pwrite = raw_pwrite,
     .bdrv_truncate = raw_truncate,
@@ -965,7 +895,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
-    int fd, open_flags, ret, i;
+    int fd, open_flags, ret;
 
     posix_aio_init();
 
@@ -1032,8 +962,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
         return ret;
     }
     s->fd = fd;
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++)
-        s->fd_pool[i] = -1;
 #if defined(__linux__)
     /* close fd so that we can reopen it as needed */
     if (s->type == FTYPE_FD) {
@@ -1061,7 +989,6 @@ static int fd_open(BlockDriverState *bs)
         (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
         close(s->fd);
         s->fd = -1;
-        raw_close_fd_pool(s);
 #ifdef DEBUG_FLOPPY
         printf("Floppy closed\n");
 #endif
@@ -1162,7 +1089,6 @@ static int raw_eject(BlockDriverState *bs, int eject_flag)
             if (s->fd >= 0) {
                 close(s->fd);
                 s->fd = -1;
-                raw_close_fd_pool(s);
             }
             fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
             if (fd >= 0) {
@@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = {
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
+
     .bdrv_pread = raw_pread,
     .bdrv_pwrite = raw_pwrite,
     .bdrv_getlength = raw_getlength,
diff --git a/configure b/configure
index 7f82786..146b3fc 100755
--- a/configure
+++ b/configure
@@ -152,7 +152,6 @@ FreeBSD)
 bsd="yes"
 audio_drv_list="oss"
 audio_possible_drivers="oss sdl esd pa"
-aio_lib="-lpthread"
 if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
     kqemu="yes"
 fi
@@ -162,7 +161,6 @@ bsd="yes"
 audio_drv_list="oss"
 audio_possible_drivers="oss sdl esd"
 oss_lib="-lossaudio"
-aio_lib="-lrt -lpthread"
 ;;
 OpenBSD)
 bsd="yes"
@@ -170,7 +168,6 @@ openbsd="yes"
 audio_drv_list="oss"
 audio_possible_drivers="oss sdl esd"
 oss_lib="-lossaudio"
-aio_lib="-lpthread"
 ;;
 Darwin)
 bsd="yes"
@@ -181,7 +178,6 @@ audio_drv_list="coreaudio"
 audio_possible_drivers="coreaudio sdl fmod"
 OS_CFLAGS="-mdynamic-no-pic"
 OS_LDFLAGS="-framework CoreFoundation -framework IOKit"
-aio_lib="-lpthread"
 ;;
 SunOS)
     solaris="yes"
@@ -533,15 +529,6 @@ if test "$mingw32" = "yes" ; then
     bsd_user="no"
 fi
 
-if [ "$darwin" = "yes" -o "$mingw32" = "yes" ] ; then
-    AIOLIBS=
-elif [ "$bsd" = "yes" ]; then
-    AIOLIBS="$aio_lib"
-else
-    # Some Linux architectures (e.g. s390) don't imply -lpthread automatically.
-    AIOLIBS="-lrt -lpthread"
-fi
-
 # Check for gcc4, error if pre-gcc4
 if test "$check_gcc" = "yes" ; then
     cat > $TMPC <<EOF
@@ -1006,14 +993,17 @@ fi
 
 ##########################################
 # AIO probe
+AIOLIBS=""
+
 if test "$aio" = "yes" ; then
   aio=no
   cat > $TMPC << EOF
-#include <aio.h>
-int main(void) { return aio_write(NULL); }
+#include <pthread.h>
+int main(void) { pthread_mutex_t lock; return 0; }
 EOF
   if $cc $ARCH_CFLAGS -o $TMPE $AIOLIBS $TMPC 2> /dev/null ; then
     aio=yes
+    AIOLIBS="-lpthread"
   fi
 fi
 
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
new file mode 100644
index 0000000..c21d579
--- /dev/null
+++ b/posix-aio-compat.c
@@ -0,0 +1,202 @@
+/*
+ * QEMU posix-aio emulation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include "osdep.h"
+
+#include "posix-aio-compat.h"
+
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static pthread_t thread_id;
+static int max_threads = 64;
+static int cur_threads = 0;
+static int idle_threads = 0;
+static TAILQ_HEAD(, aiocb) request_list;
+
+static void *aio_thread(void *unused)
+{
+    sigset_t set;
+
+    /* block all signals */
+    sigfillset(&set);
+    sigprocmask(SIG_BLOCK, &set, NULL);
+
+    while (1) {
+        struct aiocb *aiocb;
+        size_t offset;
+        int ret = 0;
+
+        pthread_mutex_lock(&lock);
+
+        while (TAILQ_EMPTY(&request_list) &&
+               ret != ETIMEDOUT) {
+            struct timespec ts = { 0 };
+            qemu_timeval tv;
+
+            qemu_gettimeofday(&tv);
+            ts.tv_sec = tv.tv_sec + 10;
+            ret = pthread_cond_timedwait(&cond, &lock, &ts);
+        }
+
+        if (ret == ETIMEDOUT)
+            break;
+
+        aiocb = TAILQ_FIRST(&request_list);
+        TAILQ_REMOVE(&request_list, aiocb, node);
+
+        offset = 0;
+        aiocb->active = 1;
+
+        idle_threads--;
+        pthread_mutex_unlock(&lock);
+
+        while (offset < aiocb->aio_nbytes) {
+            ssize_t len;
+
+            if (aiocb->is_write)
+                len = pwrite(aiocb->aio_fildes,
+                             (const char *)aiocb->aio_buf + offset,
+                             aiocb->aio_nbytes - offset,
+                             aiocb->aio_offset + offset);
+            else
+                len = pread(aiocb->aio_fildes,
+                            (char *)aiocb->aio_buf + offset,
+                            aiocb->aio_nbytes - offset,
+                            aiocb->aio_offset + offset);
+
+            if (len == -1 && errno == EINTR)
+                continue;
+            else if (len == -1) {
+                pthread_mutex_lock(&lock);
+                aiocb->ret = -errno;
+                pthread_mutex_unlock(&lock);
+                break;
+            } else if (len == 0)
+                break;
+
+            offset += len;
+
+            pthread_mutex_lock(&lock);
+            aiocb->ret = offset;
+            pthread_mutex_unlock(&lock);
+        }
+
+        pthread_mutex_lock(&lock);
+        idle_threads++;
+        pthread_mutex_unlock(&lock);
+
+        sigqueue(getpid(),
+                 aiocb->aio_sigevent.sigev_signo,
+                 aiocb->aio_sigevent.sigev_value);
+    }
+
+    idle_threads--;
+    cur_threads--;
+    pthread_mutex_unlock(&lock);
+
+    return NULL;
+}
+
+static int spawn_thread(void)
+{
+    pthread_attr_t attr;
+    int ret;
+
+    cur_threads++;
+    idle_threads++;
+
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    ret = pthread_create(&thread_id, &attr, aio_thread, NULL);
+    pthread_attr_destroy(&attr);
+
+    return ret;
+}
+
+int _compat_aio_init(struct aioinit *aioinit)
+{
+    TAILQ_INIT(&request_list);
+
+    return 0;
+}
+
+static int _compat_aio_submit(struct aiocb *aiocb, int is_write)
+{
+    aiocb->is_write = is_write;
+    aiocb->ret = -EINPROGRESS;
+    aiocb->active = 0;
+    pthread_mutex_lock(&lock);
+    if (idle_threads == 0 && cur_threads < max_threads)
+        spawn_thread();
+    TAILQ_INSERT_TAIL(&request_list, aiocb, node);
+    pthread_mutex_unlock(&lock);
+    pthread_cond_broadcast(&cond);
+
+    return 0;
+}
+
+int _compat_aio_read(struct aiocb *aiocb)
+{
+    return _compat_aio_submit(aiocb, 0);
+}
+
+int _compat_aio_write(struct aiocb *aiocb)
+{
+    return _compat_aio_submit(aiocb, 1);
+}
+
+ssize_t _compat_aio_return(struct aiocb *aiocb)
+{
+    ssize_t ret;
+
+    pthread_mutex_lock(&lock);
+    ret = aiocb->ret;
+    pthread_mutex_unlock(&lock);
+
+    return ret;
+}
+
+int _compat_aio_error(struct aiocb *aiocb)
+{
+    ssize_t ret = _compat_aio_return(aiocb);
+
+    if (ret < 0)
+        ret = -ret;
+    else
+        ret = 0;
+
+    return ret;
+}
+
+int _compat_aio_cancel(int fd, struct aiocb *aiocb)
+{
+    int ret;
+
+    pthread_mutex_lock(&lock);
+    if (!aiocb->active) {
+        TAILQ_REMOVE(&request_list, aiocb, node);
+        aiocb->ret = -ECANCELED;
+        ret = AIO_CANCELED;
+    } else if (aiocb->ret == -EINPROGRESS)
+        ret = AIO_NOTCANCELED;
+    else
+        ret = AIO_ALLDONE;
+    pthread_mutex_unlock(&lock);
+
+    return ret;
+}
+
diff --git a/posix-aio-compat.h b/posix-aio-compat.h
new file mode 100644
index 0000000..c8fcb0e
--- /dev/null
+++ b/posix-aio-compat.h
@@ -0,0 +1,86 @@
+/*
+ * QEMU posix-aio emulation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_POSIX_AIO_COMPAT_H
+#define QEMU_POSIX_AIO_COMPAT_H
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include "sys-queue.h"
+
+#define AIO_CANCELED     0x01
+#define AIO_NOTCANCELED  0x02
+#define AIO_ALLDONE      0x03
+
+struct aiocb
+{
+    int aio_fildes;
+    void *aio_buf;
+    size_t aio_nbytes;
+    struct sigevent aio_sigevent;
+    off_t aio_offset;
+
+    /* private */
+    TAILQ_ENTRY(aiocb) node;
+    int is_write;
+    ssize_t ret;
+    int active;
+};
+
+struct aioinit
+{
+    int aio_threads;
+    int aio_num;
+    int aio_idle_time;
+};
+
+int _compat_aio_init(struct aioinit *aioinit);
+int _compat_aio_read(struct aiocb *aiocb);
+int _compat_aio_write(struct aiocb *aiocb);
+int _compat_aio_error(struct aiocb *aiocb);
+ssize_t _compat_aio_return(struct aiocb *aiocb);
+int _compat_aio_cancel(int fd, struct aiocb *aiocb);
+
+static inline int aio_init(struct aioinit *aioinit)
+{
+    return _compat_aio_init(aioinit);
+}
+
+static inline int aio_read(struct aiocb *aiocb)
+{
+    return _compat_aio_read(aiocb);
+}
+
+static inline int aio_write(struct aiocb *aiocb)
+{
+    return _compat_aio_write(aiocb);
+}
+
+static inline int aio_error(struct aiocb *aiocb)
+{
+    return _compat_aio_error(aiocb);
+}
+
+static inline ssize_t aio_return(struct aiocb *aiocb)
+{
+    return _compat_aio_return(aiocb);
+}
+
+static inline int aio_cancel(int fd, struct aiocb *aiocb)
+{
+    return _compat_aio_cancel(fd, aiocb);
+}
+
+#endif

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-05 21:21 ` [Qemu-devel] " Anthony Liguori
@ 2008-12-06  9:03 ` Blue Swirl
  2008-12-06 18:26   ` Jamie Lokier
  2008-12-08 18:23     ` Anthony Liguori
From: Blue Swirl @ 2008-12-06  9:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, kvm-devel

On 12/5/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> glibc implements posix-aio as a thread pool and imposes a number of limitations.
>
>  1) It limits us to one request per file descriptor.  We hack around this by
>  dup()'ing file descriptors, which is hideously ugly.
>
>  2) It's impossible to add new interfaces, and we need a vectored read/write
>  operation to properly support a zero-copy API.
>
>  The glibc folks have suggested that we implement whatever new interfaces we
>  want; they can eventually be proposed for standardization.  This requires
>  that we provide our own posix-aio implementation, though.
>
>  This patch implements posix-aio using pthreads.  It immediately eliminates the
>  need for fd pooling.
>
>  It performs at least as well as the current posix-aio code (in some
>  circumstances, even better).
>
>  My only concern here is non-Linux Unices like FreeBSD, which have kernel
>  support for posix-aio.  Since we cannot extend those interfaces, I think that
>  even on those platforms we should still use a thread pool.
>
>  Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

>  @@ -895,6 +824,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_aio_cancel = raw_aio_cancel,
>      .aiocb_size = sizeof(RawAIOCB),
>   #endif
>  +
>  @@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = {
>      .bdrv_aio_cancel = raw_aio_cancel,
>      .aiocb_size = sizeof(RawAIOCB),
>   #endif
>  +

Some cleanup needed here?

>  +int _compat_aio_init(struct aioinit *aioinit)
>  +static int _compat_aio_submit(struct aiocb *aiocb, int is_write)
>  +int _compat_aio_read(struct aiocb *aiocb)
>  +int _compat_aio_write(struct aiocb *aiocb)
>  +ssize_t _compat_aio_return(struct aiocb *aiocb)
>  +int _compat_aio_error(struct aiocb *aiocb)
>  +int _compat_aio_cancel(int fd, struct aiocb *aiocb)

The names should not begin with an underscore.

>  +struct aiocb
>  +{
>  +    int aio_fildes;
>  +    void *aio_buf;
>  +    size_t aio_nbytes;
>  +    struct sigevent aio_sigevent;
>  +    off_t aio_offset;
>  +
>  +    /* private */
>  +    TAILQ_ENTRY(aiocb) node;
>  +    int is_write;
>  +    ssize_t ret;
>  +    int active;
>  +};
>  +
>  +struct aioinit
>  +{
>  +    int aio_threads;
>  +    int aio_num;
>  +    int aio_idle_time;
>  +};

These structs should probably be named qemu_aiocb and qemu_aioinit to
avoid conflict with system types.

I like to use unsigned types whenever possible, IIRC compilers may
generate better code with those.
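
A minimal sketch of the suggested rename (illustrative only; the final names
were still under discussion in this thread):

    /* qemu-prefixed versions of the structs above, so they cannot clash
     * with the system <aio.h> types. */
    struct qemu_aiocb {
        int              aio_fildes;
        void            *aio_buf;
        size_t           aio_nbytes;      /* already an unsigned type */
        struct sigevent  aio_sigevent;
        off_t            aio_offset;
    };

    struct qemu_aioinit {
        unsigned int aio_threads;         /* unsigned, per the suggestion */
        unsigned int aio_num;
        unsigned int aio_idle_time;
    };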


* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-06  9:03 ` Blue Swirl
@ 2008-12-06 18:26   ` Jamie Lokier
  2008-12-08 18:23     ` Anthony Liguori
From: Jamie Lokier @ 2008-12-06 18:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, kvm-devel

Blue Swirl wrote:
> I like to use unsigned types whenever possible, IIRC compilers may
> generate better code with those.

On the one hand, unsigned division can sometimes be compiled faster,
depending on the architecture's div instructions.

On the other hand, signed operations can be optimised by GCC with
-fstrict-overflow, while the same unsigned operations cannot be so optimised.
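
A small illustration of the signed case (an example, not from the thread):

    /* With -fstrict-overflow, GCC may fold the signed test to a constant 1,
     * because it may assume i + 1 never wraps (signed overflow is
     * undefined).  The unsigned test cannot be folded: wraparound is well
     * defined, so the expression is false when i == UINT_MAX. */
    int signed_check(int i)          { return i + 1 > i; }
    int unsigned_check(unsigned i)   { return i + 1 > i; }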

-- Jamie


* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-06  9:03 ` Blue Swirl
@ 2008-12-08 18:23     ` Anthony Liguori
From: Anthony Liguori @ 2008-12-08 18:23 UTC (permalink / raw)
  To: Blue Swirl; +Cc: qemu-devel, Anthony Liguori, kvm-devel

Blue Swirl wrote:
> On 12/5/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>   
> Some cleanup needed here?
>   

Hrm, yeah.

>>  +int _compat_aio_init(struct aioinit *aioinit)
>>  +static int _compat_aio_submit(struct aiocb *aiocb, int is_write)
>>  +int _compat_aio_read(struct aiocb *aiocb)
>>  +int _compat_aio_write(struct aiocb *aiocb)
>>  +ssize_t _compat_aio_return(struct aiocb *aiocb)
>>  +int _compat_aio_error(struct aiocb *aiocb)
>>  +int _compat_aio_cancel(int fd, struct aiocb *aiocb)
>>     
>
> The names should not begin with an underscore.
>   

This is okay by C99, but if we're going to rename the aioinit structure, we 
might as well rename it all to qemu_aio_XX or something like that.

>>  +struct aiocb
>>  +{
>>  +    int aio_fildes;
>>  +    void *aio_buf;
>>  +    size_t aio_nbytes;
>>  +    struct sigevent aio_sigevent;
>>  +    off_t aio_offset;
>>  +
>>  +    /* private */
>>  +    TAILQ_ENTRY(aiocb) node;
>>  +    int is_write;
>>  +    ssize_t ret;
>>  +    int active;
>>  +};
>>  +
>>  +struct aioinit
>>  +{
>>  +    int aio_threads;
>>  +    int aio_num;
>>  +    int aio_idle_time;
>>  +};
>>     
>
> These structs should probably be named qemu_aiocb and qemu_aioinit to
> avoid conflict with system types.
>
> I like to use unsigned types whenever possible, IIRC compilers may
> generate better code with those.
>   

Yeah, I don't disagree.  I was trying to maintain glibc compatibility.  
That's not strictly necessary though.

Regards,

Anthony Liguori

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-05 21:21 ` [Qemu-devel] " Anthony Liguori
@ 2008-12-09 15:51 ` Gerd Hoffmann
  2008-12-09 16:01   ` Anthony Liguori
  2008-12-09 17:16   ` Avi Kivity
From: Gerd Hoffmann @ 2008-12-09 15:51 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, kvm-devel

  Hi,

> 2) It's impossible to add new interfaces, and we need a vectored read/write
> operation to properly support a zero-copy API.

I'm eager to try vectored block ops for the xenbus block backend.

> It performs at least as well as the current posix-aio code (in some
> circumstances, even better).

Well, I see a massive slowdown when switching from sync to aio in the
xen backend code.  I think the reason is that due to the lack of a
vectored interface (and thus /me submitting separate aio requests for
each iovec element) stuff gets parallelized *way* too much and disk seek
times are killing me.

> My only concern here is non-Linux Unices like FreeBSD, which have kernel
> support for posix-aio.  Since we cannot extend those interfaces, I think that
> even on those platforms we should still use a thread pool.

Which might change some day in the future when we manage to get iovec
support into posix-aio specs.

I think the interface should use qemu-prefixed function and struct
names.  Then we can trivially map them to a system-provided aio
implementation without worrying about name clashes.
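
As a sketch of that mapping (the configure guard and the qemu_* names here
are hypothetical, and the qemu_aiocb layout would have to match the system
aiocb for the macro version to work):

    #ifdef HAVE_NATIVE_POSIX_AIO
    /* Host aio is usable: forward the qemu-prefixed calls to <aio.h>. */
    #include <aio.h>
    #define qemu_aio_read(cb)    aio_read(cb)
    #define qemu_aio_write(cb)   aio_write(cb)
    #define qemu_aio_error(cb)   aio_error(cb)
    #define qemu_aio_return(cb)  aio_return(cb)
    #else
    /* Otherwise fall back to the thread-pool implementation. */
    int     qemu_aio_read(struct qemu_aiocb *cb);
    int     qemu_aio_write(struct qemu_aiocb *cb);
    int     qemu_aio_error(struct qemu_aiocb *cb);
    ssize_t qemu_aio_return(struct qemu_aiocb *cb);
    #endif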

cheers,
  Gerd


* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-09 15:51 ` Gerd Hoffmann
@ 2008-12-09 16:01   ` Anthony Liguori
  2008-12-10 16:44     ` Andrea Arcangeli
  2008-12-09 17:16   ` Avi Kivity
From: Anthony Liguori @ 2008-12-09 16:01 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: qemu-devel, kvm-devel

Gerd Hoffmann wrote:
>   Hi,
>
>   
>> 2) It's impossible to add new interfaces, and we need a vectored read/write
>> operation to properly support a zero-copy API.
>>     
>
> I'm eager to try vectored block ops for the xenbus block backend.
>
>   
>> It performs at least as well as the current posix-aio code (in some
>> circumstances, even better).
>>     
>
> Well, I see a massive slowdown when switching from sync to aio in the
> xen backend code.  I think the reason is that due to the lack of a
> vectored interface (and thus /me submitting separate aio requests for
> each iovec element) stuff gets parallelized *way* too much and disk seek
> times are killing me.
>   

Yup.  And I'm also adding vectored IO operations.  Patches for that 
should show up in the next day or so.  We experienced the same problem 
with virtio-blk FWIW.

>> My only concern here is non-Linux Unices like FreeBSD, which have kernel
>> support for posix-aio.  Since we cannot extend those interfaces, I think that
>> even on those platforms we should still use a thread pool.
>>     
>
> Which might change some day in the future when we manage to get iovec
> support into posix-aio specs.
>
> I think the interface should use qemu-prefixed function and struct
> names.  Then we can trivially map them to a system-provided aio
> implementation without worrying about name clashes.
>   

Yes, that's what I'm going to do before committing it.

Regards,

Anthony Liguori

> cheers,
>   Gerd
>   



* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-09 15:51 ` Gerd Hoffmann
  2008-12-09 16:01   ` Anthony Liguori
@ 2008-12-09 17:16   ` Avi Kivity
From: Avi Kivity @ 2008-12-09 17:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, kvm-devel

Gerd Hoffmann wrote:
> Well, I see a massive slowdown when switching from sync to aio in the
> xen backend code.  I think the reason is that due to the lack of a
> vectored interface (and thus /me submitting separate aio requests for
> each iovec element) stuff gets parallelized *way* too much and disk seek
> times are killing me.
>   

Try switching to the deadline I/O scheduler.  CFQ has a known issue with 
this.

-- 
error compiling committee.c: too many arguments to function



* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-09 16:01   ` Anthony Liguori
@ 2008-12-10 16:44     ` Andrea Arcangeli
  2008-12-10 17:21         ` Anthony Liguori
From: Andrea Arcangeli @ 2008-12-10 16:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gerd Hoffmann, kvm-devel

On Tue, Dec 09, 2008 at 10:01:34AM -0600, Anthony Liguori wrote:
> Yes, that's what I'm going to do before committing it.

I've been hacking on this too, just to push out a fully tested patchset
without the _em badness... The problem is that if we use more than one
thread, there's a race between lseek and writev; pread/pwrite don't have
the issue, as they keep the position on the kernel stack and don't pass
through the shared file->f_pos.  We'd really need preadv/pwritev...

To solve this in userland without kernel aio we'd need to open (not
just dup) the file in each thread, then the file->f_pos will become
thread local and we can cache the last lseek value and avoid the lseek
syscall for contiguous I/O. Or we need to reduce the number of threads
to 1 per fd (screwing seeking I/O). kernel aio wouldn't have this
trouble and a single fd/file would be enough, but that would only work
on linux.


* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-10 16:44     ` Andrea Arcangeli
@ 2008-12-10 17:21         ` Anthony Liguori
  0 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-10 17:21 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: qemu-devel, Gerd Hoffmann, kvm-devel

Andrea Arcangeli wrote:
> On Tue, Dec 09, 2008 at 10:01:34AM -0600, Anthony Liguori wrote:
>   
>> Yes, that's what I'm going to do before committing it.
>>     
>
> I've been hacking on this too, just to push out a fully tested patchset
> without the _em badness... The problem is that if we use more than one
> thread, there's a race between lseek and writev; pread/pwrite don't have
> the issue, as they keep the position on the kernel stack and don't pass
> through the shared file->f_pos.  We'd really need preadv/pwritev...
>
> To solve this in userland without kernel aio we'd need to open (not
> just dup)

Why not just dup?  I've implemented this and it seems to work.

>  the file in each thread, then the file->f_pos will become
> thread local and we can cache the last lseek value and avoid the lseek
> syscall for contiguous I/O. Or we need to reduce the number of threads
> to 1 per fd (screwing seeking I/O). kernel aio wouldn't have this
> trouble and a single fd/file would be enough, but that would only work
> on linux.
>   

I've also done a preadv/pwritev implementation in userspace using linux-aio.

My current plan is to finish the refactoring, then test out each 
implementation (dup() + lseek + readv vs. linux-aio) to see which one 
performs better.  If they're equal, dup() wins out because it's more 
portable.

Regards,

Anthony Liguori

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-10 17:21         ` Anthony Liguori
@ 2008-12-10 17:29         ` Gerd Hoffmann
  2008-12-10 18:50           ` Anthony Liguori
From: Gerd Hoffmann @ 2008-12-10 17:29 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, qemu-devel, kvm-devel

>> To solve this in userland without kernel aio we'd need to open (not
>> just dup)
> 
> Why not just dup?  I've implemented this and it seems to work.

Unix keeps the file pointer in the (global) file table.  The
(per-process) file descriptor table references the file table.

Opening twice gives you two file descriptor table entries referencing
two file table entries.  Duping gives you two file descriptors
referencing the *same* file table entry.  Thus the two fds share the
file pointer.
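
A tiny demonstration of the difference (illustrative only; the test file
path is made up and error handling is omitted):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd     = open("/tmp/testfile", O_RDONLY); /* hypothetical file */
        int dupped = dup(fd);                   /* same file table entry */
        int reopen = open("/tmp/testfile", O_RDONLY); /* new entry */

        lseek(fd, 100, SEEK_SET);
        /* The dup'd fd sees offset 100; the reopened fd still sees 0. */
        printf("dup: %ld, reopen: %ld\n",
               (long)lseek(dupped, 0, SEEK_CUR),
               (long)lseek(reopen, 0, SEEK_CUR));
        return 0;
    }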

HTH,
  Gerd




* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-10 17:29         ` Gerd Hoffmann
@ 2008-12-10 18:50           ` Anthony Liguori
  2008-12-10 19:08               ` Andrea Arcangeli
From: Anthony Liguori @ 2008-12-10 18:50 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Andrea Arcangeli, qemu-devel, kvm-devel

Gerd Hoffmann wrote:
>>> To solve this in userland without kernel aio we'd need to open (not
>>> just dup)
>>>       
>> Why not just dup?  I've implemented this and it seems to work.
>>     
>
> Unix keeps the file pointer in the (global) file table.  The
> (per-process) file descriptor table references the file table.
>
> Opening twice gives you two file descriptor table entries referencing
> two file table entries.  Duping gives you two file descriptors
> referencing the *same* file table entry.  Thus the two fds share the
> file pointer.
>   

But opening twice means that you lose coherency with NFS.

I hate Unix.

Regards,

Anthony Liguori

> HTH,
>   Gerd
>
>
>   



* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-10 18:50           ` Anthony Liguori
@ 2008-12-10 19:08               ` Andrea Arcangeli
From: Andrea Arcangeli @ 2008-12-10 19:08 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

On Wed, Dec 10, 2008 at 12:50:17PM -0600, Anthony Liguori wrote:
> But opening twice means that you lose coherency with NFS.

Not sure why; they're not running from different NFS clients.  If this
really isn't feasible, other ways to go would be to stick with a
single thread and add kernel aio to fix seeking I/O bandwidth (so cfq
indexing the bucket by the thread id will also work OK).

Alternatively, we can support readv/writev only with kernel aio, no
threads, and emulate it with a flood of pread/pwrite on the other
host OSes, plus set DEBUG_BOUNCE where readv/writev is emulated.  But
then Linux will be the only host OS benefiting from zerocopy.

If we never want to add kernel aio, we can also add preadv/pwritev to
Linux, but that remains a Linux-only solution, so going with kernel aio
for a Linux-only solution is surely better than the workaround with
userland threads plus preadv/pwritev.  Still, Al or somebody should add
preadv/pwritev; they are needed and useful, as shown here.

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-10 19:08               ` Andrea Arcangeli
  (?)
@ 2008-12-11 13:12               ` Andrea Arcangeli
  2008-12-11 15:24                   ` Gerd Hoffmann
  -1 siblings, 1 reply; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-11 13:12 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, kvm-devel, qemu-devel

My current feeling is that this user thread aio thing will never
satisfy enterprise usage and kernel aio is mandatory in my view. I had
the same feeling before too, but I thought clone aio was desirable
as an intermediate step, because it could help whatever other unix host
OS that may not have native aio support. But if there's a problem with
opening the file multiple times (which btw is limiting the total
number of bdev to a dozen on a default ulimit -n with 64 max threads,
but it's probably ok), then we could as well stick to glibc aio, and
perhaps wait for it to evolve with aio_readv/writev (probably backed by a
preadv/pwritev). And we should concentrate on kernel aio and get rid
of threads when host OS is linux. We can add a dependency where the
dma api will not bounce and linearize the buffer, only if the host
backend supports native aio.

Has anybody a patch implementing kernel aio that I can plug into the
dma zerocopy api? I'm not so sure clone aio is worth maintaining
inside qemu instead of evolving glibc and kernel with preadv/pwritev
for the long term.

Thanks!

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 13:12               ` Andrea Arcangeli
@ 2008-12-11 15:24                   ` Gerd Hoffmann
  0 siblings, 0 replies; 75+ messages in thread
From: Gerd Hoffmann @ 2008-12-11 15:24 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, kvm-devel, qemu-devel

Andrea Arcangeli wrote:
> My current feeling is that this user thread aio thing will never
> satisfy enterprise usage and kernel aio is mandatory in my view.

Well, linux kernel aio has its share of problems too:

  * Anthony mentioned it may block under certain circumstances (forgot
    which ones), and you can't figure that out beforehand to turn off
    aio then.

  * It can't handle block allocation.  Kernel handles that by doing
    such writes synchronously via VFS layer (instead of the separate
    aio code paths).  Leads to horrible performance and bug reports
    such as "installs on sparse files are very slow".

  * support for vectored aio isn't that old.  IIRC it was added
    sometime around 2.6.20 (newer than current suse/redhat enterprise
    versions).  Which IMHO means you can't expect it to be present
    unconditionally.

> And we should concentrate on kernel aio and get rid
> of threads when host OS is linux.

Threads will be there anyway for kvm smp.

> Has anybody a patch implementing kernel aio that I can plug into the
> dma zerocopy api? I'm not so sure clone aio is worth maintaining
> inside qemu instead of evolving glibc

Well, wait for glibc isn't going to fly.  glibc waits for posix, and
posix waits for a reference implementation (which will not be glibc).

> and kernel with preadv/pwritev

With that in place you don't need kernel aio any more, then you can
really do it in userspace with threads.  But that probably would be
linux-only  ^W^W^W

ahem: http://www.daemon-systems.org/man/preadv.2.html

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 15:24                   ` Gerd Hoffmann
@ 2008-12-11 15:53                     ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-11 15:53 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Anthony Liguori, kvm-devel, qemu-devel

On Thu, Dec 11, 2008 at 04:24:37PM +0100, Gerd Hoffmann wrote:
> Well, linux kernel aio has its share of problems too:
> 
>   * Anthony mentioned it may block under certain circumstances (forgot
>     which ones), and you can't figure that out beforehand to turn off
>     aio then.

We've worse problems as long as bdrv_read/write are used by qcow2. And
we can fix host kernel in the long run if this becomes an issue.

>   * It can't handle block allocation.  Kernel handles that by doing
>     such writes synchronously via VFS layer (instead of the separate
>     aio code paths).  Leads to horrible performance and bug reports
>     such as "installs on sparse files are very slow".

I think here you mean O_DIRECT regardless of aio/sync API, I doubt aio
has any relevance to block allocation in any way, so whatever problem
we have with kernel API and O_DIRECT should also be there with
sync-api + userland threads and O_DIRECT.

>   * support for vectored aio isn't that old.  IIRC it was added
>     sometime around 2.6.20 (newer than current suse/redhat enterprise
>     versions).  Which IMHO means you can't expect it to be present
>     unconditionally.

I think this is a false alarm: the whole point of kernel AIO is that
even if O_DIRECT is enabled, all bios are pushed to the disk before
the disk queue is unplugged which is all we care about to get decent
disk bandwidth with zerocopy dma. Or at least that's the way it's
supposed to work if aio is implemented correctly at the bio level.

So in kernels that don't support IOCB_CMD_READV/WRITEV, we simply have
to submit an array of iocb through io_submit (i.e. convert the iov into
a vector of iocb, instead of a single iocb pointing to the iov).
Internally to io_submit a single dma command should be generated and
the same sg list should be built as if we used READV/WRITEV. In theory
READV/WRITEV should be just a cpu saving feature, it shouldn't
influence disk bandwidth. If it does, it means the bio layer is broken
and needs fixing.

If IOCB_CMD_READV/WRITEV is available, good, if not we go with
READ/WRITE and more iocb dynamically allocated. It just needs a
conversion routine from iovec, file, offset to iocb pointer when
IOCB_CMD_READV/WRITEV is not available. The iocb array can be
preallocated along with the iovec when we detect IOCB_CMD_READV/WRITEV
is not available, I've a cache layer that does this and I'll just
provide an output selectable in iovec or iocb terms, with iocb
selected when the host os is linux and IOCB_CMD_READV/WRITEV is
not available.
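
As a rough sketch of that conversion routine (assuming libaio's
io_prep_pread/io_submit; submit_readv is a hypothetical helper name,
and per the above the iocb array would really be preallocated rather
than built on the stack):

/* fall back from one vectored iocb to one iocb per iovec segment
 * when IOCB_CMD_READV is not available; error handling omitted */
#include <libaio.h>
#include <sys/uio.h>

static int submit_readv(io_context_t ctx, int fd,
                        const struct iovec *iov, int iovcnt,
                        long long offset)
{
    struct iocb cbs[iovcnt], *ptrs[iovcnt];
    int i;

    for (i = 0; i < iovcnt; i++) {
        /* one READ iocb per segment; the block layer should still
         * merge these into one request before the queue is unplugged */
        io_prep_pread(&cbs[i], fd, iov[i].iov_base, iov[i].iov_len,
                      offset);
        offset += iov[i].iov_len;
        ptrs[i] = &cbs[i];
    }
    return io_submit(ctx, iovcnt, ptrs);
}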

> Threads will be there anyway for kvm smp.

Yes, I didn't mean those threads ;). I love threads, but I love
threads that are CPU bound and allow us to exploit the whole power of
the system! But for storage, threads are purely overscheduling overhead
as far as I can tell, given we have an async api to use and we already
have to deal with the pain of async programming. So it's worth getting
the full benefit of it (i.e. no thread/overscheduling overhead).

If aio inside the kernel is too complex, then use kernel threads; it's
still better than user threads.

I mean if we keep only using threads we should get rid of bdrv_aio*
completely and move the qcow2 code to a separate thread instead of
keeping it running from the io thread. If we stick to threads then it's
worth getting the full benefit of threads (i.e. not having to deal with
the pains of async programming, and moving the qcow2 computation to a
separate CPU). Something I tried doing, but I ended up having to add
locks all over qcow2 in order to submit multiple qcow2 requests in
parallel (otherwise the lock would be global and I couldn't
differentiate between a bdrv_read for qcow2 metadata that must be
executed with the qcow2 mutex held, and a bdrv_aio_readv that can run
lockless from the point of view of the current qcow2 instance - the
qcow2 parent may take its own locks then etc..). Basically it breaks
all backends, something I'm not comfortable with right now just to get
zerocopy dma working at platter speed. Hence I stick with async
programming for now...

> Well, wait for glibc isn't going to fly.  glibc waits for posix, and
> posix waits for a reference implementation (which will not be glibc).

Agree.

> > and kernel with preadv/pwritev
> 
> With that in place you don't need kernel aio any more, then you can
> really do it in userspace with threads.  But that probably would be
> linux-only  ^W^W^W

Waiting for preadv/pwritev is just the 'quicker' version of waiting
for glibc aio_readv. And because it remains linux-only, I prefer kernel
AIO, which fixes cfq and should be the most optimal anyway (with or
without READV/WRITEV support).

So in the end: we either open the file 64 times (which I think is
perfectly coherent with nfs unless the nfs client is broken, but then
Anthony may know nfs better, I'm not a heavy nfs user here), or we go
with kernel AIO... you know my preference. That said, opening the file
64 times is probably simpler, if it has been confirmed that it doesn't
break nfs. Breaking nfs is not an option here; nfs is the ideal shared
storage for migration (we surely want to exploit the fact that the
semantics we need for a safe migration are so weak that it's worth
keeping nfs supported as 100% reliable shared storage for KVM
virtualization).

> > ahem: http://www.daemon-systems.org/man/preadv.2.html

Too bad nobody implemented it yet...

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 15:53                     ` Andrea Arcangeli
@ 2008-12-11 16:11                       ` Gerd Hoffmann
  -1 siblings, 0 replies; 75+ messages in thread
From: Gerd Hoffmann @ 2008-12-11 16:11 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, kvm-devel, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 1260 bytes --]

Andrea Arcangeli wrote:
>>   * It can't handle block allocation.  Kernel handles that by doing
>>     such writes synchronously via VFS layer (instead of the separate
>>     aio code paths).  Leads to horrible performance and bug reports
>>     such as "installs on sparse files are very slow".
> 
> I think here you mean O_DIRECT regardless of aio/sync API,

Yes.  But kernel aio requires O_DIRECT, so aio users are affected
nevertheless.

> So in kernels that don't support IOCB_CMD_READV/WRITEV, we simply have
> to submit an array of iocb through io_submit (i.e. convert the iov into
> a vector of iocb, instead of a single iocb pointing to the iov).
> Internally to io_submit a single dma command should be generated and
> the same sg list should be built as if we used READV/WRITEV. In theory
> READV/WRITEV should be just a cpu saving feature, it shouldn't
> influence disk bandwidth. If it does, it means the bio layer is broken
> and needs fixing.

Haven't tested that.  Could be it isn't a big problem, extra code size
for the two modes aside.

> > ahem: http://www.daemon-systems.org/man/preadv.2.html
> 
> Too bad nobody implemented it yet...

Kernel side looks easy, attached patch + syscall table windup in all
archs ...
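
The per-arch wire-up itself is only a couple of lines; roughly
(illustrative, with hypothetical slot numbers, since the real ones get
assigned at merge time):

/* include/asm-x86/unistd_32.h (illustrative) */
#define __NR_preadv		333	/* hypothetical slot */
#define __NR_pwritev		334	/* hypothetical slot */

/* arch/x86/kernel/syscall_table_32.S (illustrative) */
	.long sys_preadv
	.long sys_pwritev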

cheers,
  Gerd

[-- Attachment #2: preadv.diff --]
[-- Type: text/plain, Size: 1390 bytes --]

diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9..d1ea2fd 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -701,6 +701,54 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 	return ret;
 }
 
+asmlinkage ssize_t sys_preadv(unsigned int fd, const struct iovec __user *vec,
+                              unsigned long vlen, loff_t pos)
+{
+	struct file *file;
+	ssize_t ret = -EBADF;
+	int fput_needed;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	file = fget_light(fd, &fput_needed);
+	if (file) {
+		ret = -ESPIPE;
+		if (file->f_mode & FMODE_PREAD)
+			ret = vfs_readv(file, vec, vlen, &pos);
+		fput_light(file, fput_needed);
+	}
+
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+asmlinkage ssize_t sys_pwritev(unsigned int fd, const struct iovec __user *vec,
+                              unsigned long vlen, loff_t pos)
+{
+	struct file *file;
+	ssize_t ret = -EBADF;
+	int fput_needed;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	file = fget_light(fd, &fput_needed);
+	if (file) {
+		ret = -ESPIPE;
+		if (file->f_mode & FMODE_PWRITE)
+			ret = vfs_writev(file, vec, vlen, &pos);
+		fput_light(file, fput_needed);
+	}
+
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 			   size_t count, loff_t max)
 {

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 15:24                   ` Gerd Hoffmann
@ 2008-12-11 16:41                     ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-11 16:41 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Andrea Arcangeli, kvm-devel, qemu-devel

Gerd Hoffmann wrote:
> Andrea Arcangeli wrote:
>   
>> My current feeling is that this user thread aio thing will never
>> satisfy enterprise usage and kernel aio is mandatory in my view.
>>     
>
> Well, linux kernel aio has its share of problems too:
>
>   * Anthony mentioned it may block under certain circumstances (forgot
>     which ones), and you can't figure that out beforehand to turn off
>     aio then.
>
>   * It can't handle block allocation.  Kernel handles that by doing
>     such writes synchronously via VFS layer (instead of the separate
>     aio code paths).  Leads to horrible performance and bug reports
>     such as "installs on sparse files are very slow".
>
>   * support for vectored aio isn't that old.  IIRC it was added
>     sometime around 2.6.20 (newer than current suse/redhat enterprise
>     versions).  Which IMHO means you can't expect it to be present
>     unconditionally.
>
>   
>> And we should concentrate on kernel aio and get rid
>> of threads when host OS is linux.
>>     
>
> Threads will be there anyway for kvm smp.
>
>   
>> Has anybody a patch implementing kernel aio that I can plug into the
>> dma zerocopy api? I'm not so sure clone aio is worth maintaining
>> inside qemu instead of evolving glibc
>>     
>
> Well, wait for glibc isn't going to fly.  glibc waits for posix, and
> posix waits for a reference implementation (which will not be glibc).
>
>   
>> and kernel with preadv/pwritev
>>     
>
> With that in place you don't need kernel aio any more, then you can
> really do it in userspace with threads.  But that probably would be
> linux-only  ^W^W^W
>   

linux-only is okay but we just need a relatively sane fallback.  There
have been preadv/pwritev patches posted before; for some reason they
never were merged.

http://lwn.net/Articles/163603/

> ahem: http://www.daemon-systems.org/man/preadv.2.html
>   

Yeah, dunno if that's all BSDs or just NetBSD.

Regards,

Anthony Liguori

> cheers,
>   Gerd
>   


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 16:11                       ` Gerd Hoffmann
@ 2008-12-11 16:49                         ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-11 16:49 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Anthony Liguori, kvm-devel, qemu-devel

On Thu, Dec 11, 2008 at 05:11:08PM +0100, Gerd Hoffmann wrote:
> Yes.  But kernel aio requires O_DIRECT, so aio users are affected
> nevertheless.

Are you sure? It surely wasn't the case...

> Havn't tested that.  Could be it isn't a big problem, extra code size
> for the two modes aside.

There shouldn't be any problem.

> Kernel side looks easy, attached patch + syscall table windup in all
> archs ...

So should we depend on this?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 16:49                         ` Andrea Arcangeli
@ 2008-12-11 17:20                           ` Gerd Hoffmann
  -1 siblings, 0 replies; 75+ messages in thread
From: Gerd Hoffmann @ 2008-12-11 17:20 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, kvm-devel, qemu-devel

Andrea Arcangeli wrote:
> On Thu, Dec 11, 2008 at 05:11:08PM +0100, Gerd Hoffmann wrote:
>> Yes.  But kernel aio requires O_DIRECT, so aio users are affected
>> nevertheless.
> 
> Are you sure? It surely wasn't the case...

Tons of docs say so, but they might be wrong; I didn't check.

>> Kernel side looks easy, attached patch + syscall table windup in all
>> archs ...
> 
> So should we depend on this?

I suspect we will end up with multiple implementations anyway.

So one could be preadv+threads.  Probably quite portable if we manage to
get the syscalls into linux kernel and glibc.  All *BSDs have it
already, for solaris I've found a feature request on that.  Dunno for MacOS.

Additionally we could have OS-specific bits such as linux-aio.  Maybe
also posix-aio for the *BSD family in case their kernel support for that
is better than what glibc provides (i.e. can handle multiple requests in
parallel without the fdpool hack).
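
A rough sketch of what the preadv+threads primitive could look like
(qemu_preadv and the HAVE_PREADV configure check are hypothetical
names; the fallback is the flood-of-pread emulation mentioned earlier
in the thread):

#include <sys/uio.h>
#include <unistd.h>

static ssize_t qemu_preadv(int fd, const struct iovec *iov, int iovcnt,
                           off_t offset)
{
#ifdef HAVE_PREADV              /* assumed configure-time check */
    return preadv(fd, iov, iovcnt, offset);
#else
    /* emulate with one pread() per segment */
    ssize_t total = 0;
    int i;

    for (i = 0; i < iovcnt; i++) {
        ssize_t n = pread(fd, iov[i].iov_base, iov[i].iov_len, offset);
        if (n < 0)
            return total ? total : n;
        total += n;
        offset += n;
        if ((size_t)n < iov[i].iov_len)
            break;              /* short read */
    }
    return total;
#endif
}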

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 17:20                           ` Gerd Hoffmann
@ 2008-12-11 18:11                             ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-11 18:11 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Anthony Liguori, kvm-devel, qemu-devel

On Thu, Dec 11, 2008 at 06:20:09PM +0100, Gerd Hoffmann wrote:
> Andrea Arcangeli wrote:
> > On Thu, Dec 11, 2008 at 05:11:08PM +0100, Gerd Hoffmann wrote:
> >> Yes.  But kernel aio requires O_DIRECT, so aio users are affected
> >> nevertheless.
> > 
> > Are you sure? It surely wasn't the case...
> 
> Tons of docs say so, but they might be wrong; I didn't check.

I guess those tons of docs are just wrong then ;). I see no mention of
O_DIRECT in `man io_submit` at least... I seem to recall initially aio
only worked without O_DIRECT... ;). It's quite the opposite, O_DIRECT
works best with kernel aio, not the other way around. O_DIRECT
read/writes look very much like non-O_DIRECT seeking reads. For
seeking sync-reads kernel aio pays off as well as with O_DIRECT.

> So one could be preadv+threads.  Probably quite portable if we manage to
> get the syscalls into linux kernel and glibc.  All *BSDs have it
> already, for solaris I've found a feature request on that.  Dunno for MacOS.

Who's taking care of submitting it to linux?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 18:11                             ` Andrea Arcangeli
@ 2008-12-11 20:38                               ` Gerd Hoffmann
  -1 siblings, 0 replies; 75+ messages in thread
From: Gerd Hoffmann @ 2008-12-11 20:38 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Anthony Liguori, kvm-devel, qemu-devel

  Hi,

>> So one could be preadv+threads.  Probably quite portable if we manage to
>> get the syscalls into linux kernel and glibc.  All *BSDs have it
>> already, for solaris I've found a feature request on that.  Dunno for MacOS.
> 
> Who's taking care of submitting it to linux?

I will.

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 18:11                             ` Andrea Arcangeli
  (?)
  (?)
@ 2008-12-11 20:40                             ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-11 20:40 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, kvm-devel, qemu-devel

Andrea Arcangeli wrote:
> On Thu, Dec 11, 2008 at 06:20:09PM +0100, Gerd Hoffmann wrote:
>   
>> Andrea Arcangeli wrote:
>>     
>>> On Thu, Dec 11, 2008 at 05:11:08PM +0100, Gerd Hoffmann wrote:
>>>       
>>>> Yes.  But kernel aio requires O_DIRECT, so aio users are affected
>>>> nevertheless.
>>>>         
>>> Are you sure? It surely wasn't the case...
>>>       
>> Tons of docs say so, but they might be wrong; I didn't check.
>>     
>
> I guess those tons of docs are just wrong then ;). I see no mention of
> O_DIRECT in `man io_submit` at least...

For most filesystems that people care about, io_submit blocks unless
you use O_DIRECT.

This is why the current trends are toward kernel space thread pools.  It 
would be very difficult to modify every file system to support true 
asynchronous buffered IO.

Also, it's pretty clear that linux-aio doesn't have a strong future so I 
don't think it's very worthwhile to support.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 15:53                     ` Andrea Arcangeli
  (?)
  (?)
@ 2008-12-11 21:30                     ` Christoph Hellwig
  -1 siblings, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2008-12-11 21:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gerd Hoffmann, kvm-devel

On Thu, Dec 11, 2008 at 04:53:35PM +0100, Andrea Arcangeli wrote:
> >   * It can't handle block allocation.  Kernel handles that by doing
> >     such writes synchronously via VFS layer (instead of the separate
> >     aio code paths).  Leads to horrible performance and bug reports
> >     such as "installs on sparse files are very slow".
> 
> I think here you mean O_DIRECT regardless of aio/sync API, I doubt aio
> has any relevance to block allocation in any way, so whatever problem
> we have with kernel API and O_DIRECT should also be there with
> sync-api + userland threads and O_DIRECT.

userland threads with O_DIRECT means only the thread doing block
allocation gets stalled, not any other.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 16:49                         ` Andrea Arcangeli
  (?)
  (?)
@ 2008-12-11 21:32                         ` Christoph Hellwig
  2008-12-12  0:27                             ` Andrea Arcangeli
  -1 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2008-12-11 21:32 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gerd Hoffmann, kvm-devel

On Thu, Dec 11, 2008 at 05:49:47PM +0100, Andrea Arcangeli wrote:
> On Thu, Dec 11, 2008 at 05:11:08PM +0100, Gerd Hoffmann wrote:
> > Yes.  But kernel aio requires O_DIRECT, so aio users are affected
> > nevertheless.
> 
> Are you sure? It surely wasn't the case...

Mainline kernel aio only implements O_DIRECT.  Some RHEL version had
support for buffered kernel AIO.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 21:32                         ` Christoph Hellwig
@ 2008-12-12  0:27                             ` Andrea Arcangeli
  0 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12  0:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel, Gerd Hoffmann, kvm-devel

On Thu, Dec 11, 2008 at 10:32:13PM +0100, Christoph Hellwig wrote:
> Mainline kernel aio only implements O_DIRECT.  Some RHEL version had
> support for buffered kernel AIO.

Not just RHEL, I'm sure SLES supported this too and it was quite
cleanly implemented, not sure why this wasn't picked up by mainline.

In light of this, preadv/pwritev or having the file opened by each
thread (if proven safe with nfs) remain the only options. Zerocopy dma
must be supported without O_DIRECT too, even if it's probably more
important when O_DIRECT is on.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-11 18:11                             ` Andrea Arcangeli
@ 2008-12-12  8:23                               ` Jens Axboe
  -1 siblings, 0 replies; 75+ messages in thread
From: Jens Axboe @ 2008-12-12  8:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gerd Hoffmann, kvm-devel, aarcange

On Thu, Dec 11 2008, Andrea Arcangeli wrote:
> On Thu, Dec 11, 2008 at 06:20:09PM +0100, Gerd Hoffmann wrote:
> > Andrea Arcangeli wrote:
> > > On Thu, Dec 11, 2008 at 05:11:08PM +0100, Gerd Hoffmann wrote:
> > >> Yes.  But kernel aio requires O_DIRECT, so aio users are affected
> > >> nevertheless.
> > > 
> > > Are you sure? It surely wasn't the case...
> > 
> > Tons of docs say so, but they might be wrong; I didn't check.
> 
> I guess those tons of docs are just wrong then ;). I see no mention of
> O_DIRECT in `man io_submit` at least... I seem to recall initially aio
> only worked without O_DIRECT... ;). It's quite the opposite, O_DIRECT
> works best with kernel aio, not the other way around. O_DIRECT
> read/writes look very much like non-O_DIRECT seeking reads. For
> seeking sync-reads kernel aio pays off as well as with O_DIRECT.

aio is only async with O_DIRECT, with buffered IO it's sync.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12  8:23                               ` Jens Axboe
@ 2008-12-12 11:51                                 ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 11:51 UTC (permalink / raw)
  To: Jens Axboe; +Cc: qemu-devel, Gerd Hoffmann, kvm-devel

On Fri, Dec 12, 2008 at 09:23:10AM +0100, Jens Axboe wrote:
> aio is only async with O_DIRECT, with buffered IO it's sync.

That's very bad, because aio is as needed for O_DIRECT as for buffered
read seeking workloads that otherwise have zero chance to fill the
I/O pipe... (well unless one is threading in userland which is obviously
less efficient than what aio could achieve in kernel space, even
without taking cfq screwed bucketing into account).

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 11:51                                 ` Andrea Arcangeli
@ 2008-12-12 11:54                                   ` Jens Axboe
  -1 siblings, 0 replies; 75+ messages in thread
From: Jens Axboe @ 2008-12-12 11:54 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Jens Axboe, qemu-devel, Gerd Hoffmann, kvm-devel

On Fri, Dec 12 2008, Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 09:23:10AM +0100, Jens Axboe wrote:
> > aio is only async with O_DIRECT, with buffered IO it's sync.
> 
> That's very bad, because aio is as needed for O_DIRECT as for buffered
> read seeking workloads that otherwise have zero chance to fill the
> I/O pipe... (well unless one is threading in userland which is obviously
> less efficient than what aio could achieve in kernel space, even

I agree completely. The buffered aio patches got pretty involved though,
it wasn't real pretty in the end. So it never got merged. Looks like the
most realistic way forward is some variant of syslet (or the acall stuff
that Zach has been working on), which is largely a cop out and will
never perform as well.

> without taking cfq screwed bucketing into account).

I added CLONE_IO some time ago to avoid that, so it's perfectly possible
to share cfq io contexts with threads or processes even in userspace!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 11:54                                   ` Jens Axboe
@ 2008-12-12 14:13                                     ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 14:13 UTC (permalink / raw)
  To: Jens Axboe; +Cc: qemu-devel, Gerd Hoffmann, kvm-devel

On Fri, Dec 12, 2008 at 12:54:21PM +0100, Jens Axboe wrote:
> I agree completely. The buffered aio patches got pretty involved though,
> it wasn't real pretty in the end. So it never got merged. Looks like the
> most realistic way forward is some variant of syslet (or the acall stuff
> that Zach has been working on), which is largely a cop out and will
> never perform as well.

It'll at least perform better than a brand new userland pool of
threads for each task that needs aio functionality, and it can be
optimized later if we want ;).

But I'm surprised, the aio patches in 2.4 were very clean, we didn't
have to break filesystems, it was really nicely done work, enterprise
quality as demonstrated by the several databases running on it for
years.  Ironically the O_DIRECT part didn't work at the
time... because effectively the O_DIRECT part is more difficult. So
2.6 has the hard stuff done and misses the simpler stuff. I guess the
simpler stuff is harder to merge as it has more users.

Well I hope it'll be fixed... for kvm/qemu we definitely require aio
for buffered reads too (buffered writes aren't a big deal but reads
are). For the parent images it makes sense to run them in buffered
mode even on servers using O_DIRECT, so basically we can't use
linux-aio until this is fixed somehow.

In the meantime I think it'd be better to return -EINVAL (so the
userland thread can fall back to a userland thread pool) instead of
just behaving synchronously, which can break GUI and interactive
behavior...

> I added CLONE_IO some time ago to avoid that, so it's perfectly possible
> to share cfq io contexts with threads or processes even in userspace!

It's available in recent kernels I see! so the fix is easy. Only
problem is how to pass CLONE_IO to pthread_create... We'll have to
make a linux-only change and call clone by hand under some #ifdef
CLONE_IO.
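
Something along these lines, as an untested sketch (spawn_io_worker is
a hypothetical name; the CLONE_IO value is the one from linux/sched.h,
and real code needs careful stack/TLS handling since pthreads knows
nothing about such a thread):

#define _GNU_SOURCE
#include <sched.h>
#include <stdlib.h>

#ifndef CLONE_IO
#define CLONE_IO 0x80000000     /* share io context (linux/sched.h) */
#endif

static int spawn_io_worker(int (*fn)(void *), void *arg)
{
    const size_t stack_size = 256 * 1024;
    char *stack = malloc(stack_size);

    if (!stack)
        return -1;
    /* stack grows down on most architectures; stack is leaked here */
    return clone(fn, stack + stack_size,
                 CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
                 CLONE_THREAD | CLONE_SYSVSEM | CLONE_IO, arg);
}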

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 14:13                                     ` Andrea Arcangeli
@ 2008-12-12 14:24                                       ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 14:24 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jens Axboe, qemu-devel, Gerd Hoffmann, kvm-devel, Chris Wright

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 12:54:21PM +0100, Jens Axboe wrote:
>   
> In the meantime I think it'd be better to return -EINVAL (so the
> userland thread can fall back to a userland thread pool) instead of
> just behaving synchronously, which can break GUI and interactive
> behavior...
>   

Yes, IMHO, this is the major problem with the current state of 
linux-aio.  I really don't want to expose an aio= flag for users to 
configure.  I'd rather just do the Right Thing automatically.

>> I added CLONE_IO some time ago to avoid that, so it's perfectly possible
>> to share cfq io contexts with threads or processes even in userspace!
>>     
>
> It's available in recent kernels I see! so the fix is easy. Only
> problem is how to pass CLONE_IO to pthread_create... We'll have to
> make a linux-only change and call clone by hand under some #ifdef
> CLONE_IO.
>   

I have no problem with this and I believe Chris was going to attempt an 
implementation.

Regards,

Anthony Liguori



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-10 19:08               ` Andrea Arcangeli
@ 2008-12-12 14:24                 ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 14:24 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

On Wed, Dec 10, 2008 at 08:08:10PM +0100, Andrea Arcangeli wrote:
> On Wed, Dec 10, 2008 at 12:50:17PM -0600, Anthony Liguori wrote:
> > But opening twice means that you lose coherency with NFS.
> 
> Not sure why. They're not running from different nfs clients. If this

I just got confirmation from Trond that from the nfs point of view,
opening the file multiple times or duping it is the same, as I
expected (all that matters is that the inode is the same so the
pagecache radix tree is the same etc.). So opening the file each time
a thread starts would provide a local f_pos and it would work fine on
older/current kernels too. In theory other nfs clients should also
behave. Not sure what's best: to hack around the bdrv api and open
the file multiple times, or to wait for preadv/pwritev.
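
As a minimal sketch (made-up names, not the actual bdrv code), each
worker would just do its own open, sharing the inode and pagecache but
getting a private f_pos:

#include <fcntl.h>
#include <stddef.h>
#include <unistd.h>

static void *aio_worker(void *opaque)
{
    const char *filename = opaque;      /* image path captured at start */
    int fd = open(filename, O_RDWR);    /* same inode, private f_pos */

    if (fd < 0)
        return NULL;
    /* ... lseek()+read()/write() without racing other workers' f_pos ... */
    close(fd);
    return NULL;
}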

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 14:24                 ` Andrea Arcangeli
@ 2008-12-12 14:35                   ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 14:35 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

Andrea Arcangeli wrote:
> On Wed, Dec 10, 2008 at 08:08:10PM +0100, Andrea Arcangeli wrote:
>   
>> On Wed, Dec 10, 2008 at 12:50:17PM -0600, Anthony Liguori wrote:
>>     
>>> But opening twice means that you lose coherency with NFS.
>>>       
>> Not sure why. They're not running from different nfs clients. If this
>>     
>
> I just got confirmation from Trond that from the nfs point of view,
> opening the file multiple times or duping it is the same, as I
> expected (all that matters is that the inode is the same so the
> pagecache radix tree is the same etc.). So opening the file each time
> a thread starts,

I've been thinking about this; the problems I see are:

1) It's impossible to accept a file descriptor for a block device 
(possibly not a problem)

2) You'd have to open all the file descriptors at once.  Otherwise, you 
get really strange behavior if the file gets deleted while the guest is 
running (for instance, with -snapshot).

Regards,

Anthony Liguori

>  would provide a local f_pos and it would work fine on
> older/current kernels too. In theory other nfs clients should also
> behave. Not sure what's best: to hack around the bdrv api and open
> the file multiple times, or to wait for preadv/pwritev.
>   


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 14:35                   ` Anthony Liguori
@ 2008-12-12 15:44                     ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 15:44 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

On Fri, Dec 12, 2008 at 08:35:57AM -0600, Anthony Liguori wrote:
> I've been thinking about this; the problems I see are:
>
> 1) It's impossible to accept a file descriptor for a block device (possibly 
> not a problem)

What do you mean by accept? Do you mean accepting a tcp connection? How
would a block device fd be related to accept(2)?

> 2) You'd have to open all the file descriptors at once.  Otherwise, you get 
> really strange behavior if the file gets deleted while the guest is running 
> (for instance, with -snapshot).

Definitely, that's what I meant by hacking around the bdrv api... not
even close to nice, but doable in theory. The only advantage is that it
runs on older kernels, but with seeking I/O lseek has to run as well.

Now that linux-aio is out of the picture for quite a long time for us,
I guess it's worth waiting for preadv/pwritev, sticking with that, and
reconsidering linux-aio after they fix it... Waiting for Gerd to post a
full patch.

But it's your call... I'm fine either way. Clearly an OS missing
preadv/pwritev would need to be limited to 1 thread per fd (but 1
thread per fd kind of breaks with the current _global_ list, so I guess
they'll be limited to just 1 thread; otherwise it may actually be
simpler to just open the file multiple times than to have a per-fd
queue ;), not the end of the world for them.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 14:24                                       ` Anthony Liguori
@ 2008-12-12 16:33                                         ` Chris Wright
  -1 siblings, 0 replies; 75+ messages in thread
From: Chris Wright @ 2008-12-12 16:33 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andrea Arcangeli, Jens Axboe, qemu-devel, Gerd Hoffmann,
	kvm-devel, Chris Wright

* Anthony Liguori (anthony@codemonkey.ws) wrote:
> Andrea Arcangeli wrote:
>> On Fri, Dec 12, 2008 at 12:54:21PM +0100, Jens Axboe wrote:
>>> I added CLONE_IO some time ago to avoid that, so it's perfectly possible
>>> to share cfq io contexts with threads or processes even in userspace!

Yes, I've used it in this context and it solves the throughput issue we
see with cfq idling the independent threads.  Not doing vectored io
all the way through only exacerbates the problem, of course.

>> It's available in recent kernels, I see! So the fix is easy. The only
>> problem is how to pass CLONE_IO to pthread_create... We'll have to
>> make a Linux-only change and call clone by hand under some #ifdef
>> CLONE_IO.
>
> I have no problem with this and I believe Chris was going to attempt an  
> implementation.

I have an implementation that is just raw clone and futex (so not portable
beyond linux/x86).  I'll post it later today.  We'll need glibc changes
down the road to handle this properly.
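
For reference, the futex half of such a scheme might look like this
sketch (purely illustrative, not the actual patch): with
CLONE_CHILD_CLEARTID the kernel zeroes the tid word and futex-wakes it
when the worker exits, so the parent can join on it:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static void join_worker(volatile int *ctid)
{
    int tid;

    /* loop until the kernel clears *ctid at thread exit */
    while ((tid = *ctid) != 0)
        syscall(SYS_futex, ctid, FUTEX_WAIT, tid, NULL, NULL, 0);
}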

thanks,
-chris

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 15:44                     ` Andrea Arcangeli
@ 2008-12-12 16:49                       ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 16:49 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 08:35:57AM -0600, Anthony Liguori wrote:
>   
>> I've been thinking about this; the problems I see are:
>>
>> 1) It's impossible to accept a file descriptor for a block device (possibly 
>> not a problem)
>>     
>
> What do you mean by accept? Do you mean accepting a tcp connection? How
> would a block device fd be related to accept(2)?
>   

I meant, if you wanted to pass a file descriptor as a raw device.  So:

qemu -hda raw:fd=4

Or something like that.  We don't support this today.

> Now that linux-aio is out of the picture for quite a long time for us,
> I guess it's worth waiting for preadv/pwritev, sticking with that, and
> reconsidering linux-aio after they fix it... Waiting for Gerd to post a
> full patch.
>
> But it's your call... I'm fine either way. Clearly an OS missing
> preadv/pwritev would need to be limited to 1 thread per fd (but 1
> thread per fd kind of breaks with the current _global_ list, so I guess
> they'll be limited to just 1 thread; otherwise it may actually be
> simpler to just open the file multiple times than to have a per-fd
> queue ;), not the end of the world for them.
>   

I think bouncing the iov and just using pread/pwrite may be our best 
bet.  It means memory allocation, but we can cap it.  Since we're using 
threads, we can just force a thread to sleep until memory becomes 
available, so it's actually pretty straightforward.
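
As a sketch of the read side (plain POSIX; preadv_bounce is a made-up
name and the error handling is simplified):

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t preadv_bounce(int fd, const struct iovec *iov, int iovcnt,
                             off_t offset)
{
    size_t total = 0, copied = 0;
    ssize_t ret;
    char *buf;
    int i;

    for (i = 0; i < iovcnt; i++)
        total += iov[i].iov_len;

    buf = malloc(total);
    if (buf == NULL)
        return -1;	/* ENOMEM: caller can sleep and retry */

    ret = pread(fd, buf, total, offset);

    /* scatter the linear buffer back into the caller's vector */
    for (i = 0; ret > 0 && i < iovcnt && copied < (size_t)ret; i++) {
        size_t len = iov[i].iov_len;

        if (len > (size_t)ret - copied)
            len = (size_t)ret - copied;
        memcpy(iov[i].iov_base, buf + copied, len);
        copied += len;
    }

    free(buf);
    return ret;
}

The write side mirrors this with a gather-memcpy followed by pwrite().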

We can use libaio on older Linuxes to simulate preadv/pwritev, use the 
proper syscalls on newer kernels and on BSDs, and bounce everything else.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 16:33                                         ` Chris Wright
@ 2008-12-12 16:51                                           ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 16:51 UTC (permalink / raw)
  To: Chris Wright
  Cc: Andrea Arcangeli, Jens Axboe, qemu-devel, Gerd Hoffmann, kvm-devel

Chris Wright wrote:
> * Anthony Liguori (anthony@codemonkey.ws) wrote:
>   
>>> It's available in recent kernels, I see! So the fix is easy. The only
>>> problem is how to pass CLONE_IO to pthread_create... We'll have to
>>> make a Linux-only change and call clone by hand under some #ifdef
>>> CLONE_IO.
>>>       
>> I have no problem with this and I believe Chris was going to attempt an  
>> implementation.
>>     
>
> I have an implementation that is just raw clone and futex (so not portable
> beyond linux/x86).  I'll post it later today.  We'll need glibc changes
> down the road to handle this properly.
>   

I just committed the posix-aio switch to the thread pool, so it would be 
great if you could base on that.

Regards,

Anthony Liguori

> thanks,
> -chris
>   


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 16:51                                           ` Anthony Liguori
@ 2008-12-12 16:52                                             ` Chris Wright
  -1 siblings, 0 replies; 75+ messages in thread
From: Chris Wright @ 2008-12-12 16:52 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, Andrea Arcangeli, Jens Axboe, qemu-devel,
	Gerd Hoffmann, kvm-devel

* Anthony Liguori (anthony@codemonkey.ws) wrote:
> Chris Wright wrote:
>> * Anthony Liguori (anthony@codemonkey.ws) wrote:
>>   
>>>> It's available in recent kernels, I see! So the fix is easy. The only
>>>> problem is how to pass CLONE_IO to pthread_create... We'll have to
>>>> make a Linux-only change and call clone by hand under some #ifdef
>>>> CLONE_IO.
>>>>       
>>> I have no problem with this and I believe Chris was going to attempt 
>>> an  implementation.
>>>     
>>
>> I have an implementation that is just raw clone and futex (so not portable
>> beyond linux/x86).  I'll post it later today.  We'll need glibc changes
>> down the road to handle this properly.
>
> I just committed the posix-aio switch to the thread pool, so it would be  
> great if you could base on that.

That's what I've been basing off of.

thanks,
-chris

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 16:49                       ` Anthony Liguori
@ 2008-12-12 17:09                         ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 17:09 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

On Fri, Dec 12, 2008 at 10:49:45AM -0600, Anthony Liguori wrote:
> I meant, if you wanted to pass a file descriptor as a raw device.  So:
>
> qemu -hda raw:fd=4
>
> Or something like that.  We don't support this today.

ah ok.

> I think bouncing the iov and just using pread/pwrite may be our best bet.  
> It means memory allocation, but we can cap it.  Since we're using threads, 

It's already capped. However, currently it generates an iovec; we
simply have to check that the iovcnt is 1, and if it is, we pread from
iov.iov_base, iov.iov_len. The dma api will take care to enforce
iovcnt == 1 for the iovec if preadv/pwritev isn't detected at
compile time.

> we can just force a thread to sleep until memory becomes available, so it's 
> actually pretty straightforward.

There's no way to detect that and wait for memory; it'd be sigkilled
before you can check... at least with the default overcommit. The way
the dma api works is that it doesn't send one mega-large writev, but
sends it in pieces capped by the max buffer size, with many iovecs with
iovcnt = 1.

> We can use libaio on older Linuxes to simulate preadv/pwritev, use the 
> proper syscalls on newer kernels and on BSDs, and bounce everything else.

Given that READV/WRITEV aren't available in anything but quite recent
kernels, and given that without O_DIRECT each iocb will become
synchronous, we can't use libaio. Also, once they fix linux-aio, if we
do that the iocb logic would need to be largely refactored. So I'm not
sure it's worth it, as it can't handle 2.6.16-18 when O_DIRECT is
disabled (when O_DIRECT is enabled we could just build an array of
linear iocbs).

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 17:09                         ` Andrea Arcangeli
@ 2008-12-12 17:25                           ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 17:25 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 10:49:45AM -0600, Anthony Liguori wrote:
>   
>> I meant, if you wanted to pass a file descriptor as a raw device.  So:
>>
>> qemu -hda raw:fd=4
>>
>> Or something like that.  We don't support this today.
>>     
>
> ah ok.
>
>   
>> I think bouncing the iov and just using pread/pwrite may be our best bet.  
>> It means memory allocation, but we can cap it.  Since we're using threads, 
>>     
>
> It's already capped. However, currently it generates an iovec; we
> simply have to check that the iovcnt is 1, and if it is, we pread from
> iov.iov_base, iov.iov_len. The dma api will take care to enforce
> iovcnt == 1 for the iovec if preadv/pwritev isn't detected at
> compile time.
>   

Hrm, that's more complex than I was expecting.  I was thinking the bdrv 
aio infrastructure would always take an iovec.  Any details about the 
underlying host's ability to handle the iovec would be insulated.

>> we can just force a thread to sleep until memory becomes available, so it's 
>> actually pretty straightforward.
>>     
>
> There's no way to detect that and wait for memory,

If we artificially cap at say 50MB, then you do something like:

/* called with bounce_mutex held */
while (buffer == NULL) {
   buffer = try_to_bounce(offset, iov, iovcnt, &size);
   if (buffer == NULL && errno == ENOMEM) {
      /* released and re-acquired inside pthread_cond_wait() */
      pthread_cond_wait(&bounce_cond, &bounce_mutex);
   }
}

try_to_bounce allocs with malloc(), but if you exceed 50MB, it fails 
with ENOMEM.  In your bounce_free() function, you do a 
pthread_cond_broadcast() to wake up any threads potentially waiting to 
allocate memory.
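
Fleshed out slightly (made-up names, and folding the wait into the
allocator instead of leaving it to the caller):

#include <pthread.h>
#include <stdlib.h>

#define BOUNCE_CAP (50 << 20)	/* the artificial 50MB cap */

static pthread_mutex_t bounce_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t bounce_cond = PTHREAD_COND_INITIALIZER;
static size_t bounce_in_use;

static void *bounce_alloc(size_t size)
{
    void *buf;

    pthread_mutex_lock(&bounce_mutex);
    while (bounce_in_use + size > BOUNCE_CAP)
        pthread_cond_wait(&bounce_cond, &bounce_mutex);
    buf = malloc(size);
    if (buf)
        bounce_in_use += size;
    pthread_mutex_unlock(&bounce_mutex);
    return buf;
}

static void bounce_free(void *buf, size_t size)
{
    free(buf);
    pthread_mutex_lock(&bounce_mutex);
    bounce_in_use -= size;
    /* wake any threads sleeping in bounce_alloc() */
    pthread_cond_broadcast(&bounce_cond);
    pthread_mutex_unlock(&bounce_mutex);
}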

This lets us expose a preadv/pwritev function that actually works.  The 
expectation is that bouncing will outperform just doing pread/pwrite of 
each vector.  Of course, you could get smart and, if try_to_bounce fails, 
fall back to pread/pwrite of each vector.  Likewise, you can fast-path the 
case of a single iovec to avoid bouncing entirely.

Regards,

Anthony Liguori

>  it'd sigkill before
> you can check... at least with the default overcommit. The way the dma
> api works, is that it doesn't send a mega large writev, but send it in
> pieces capped by the max buffer size, with many iovecs with iovcnt = 1.
>
>   
>> We can use libaio on older Linux's to simulate preadv/pwritev.  Use the 
>> proper syscalls on newer kernels, on BSDs, and bounce everything else.
>>     
>
> Given READV/WRITEV aren't available in not very recent kernels and
> given that without O_DIRECT each iocb will become synchronous, we
> can't use the libaio. Also once they fix linux-aio, if we do that, the
> iocb logic would need to be largely refactored. So I'm not sure if it
> worth it as it can't handle 2.6.16-18 when O_DIRECT is disabled (when
> O_DIRECT is enabled we could just build an array of linear iocb).
>   


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 17:25                           ` Anthony Liguori
@ 2008-12-12 17:52                             ` Andrea Arcangeli
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 17:52 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

On Fri, Dec 12, 2008 at 11:25:55AM -0600, Anthony Liguori wrote:
> Hrm, that's more complex than I was expecting.  I was thinking the bdrv aio 
> infrastructure would always take an iovec.  Any details about the 
> underlying host's ability to handle the iovec would be insulated.

You can't remove the restart memory-capped mechanism from the dma api:
we have to handle dma to non-ram, which potentially requires copying the
whole buffer, so we're forced to have a safe linearization at the dma
api layer. So it's not necessary to reinvent the same
restart-partial-transfer logic in the aio layer too. Just set the
define and teach the aio logic to use pread/pwrite if iovcnt == 1 and
you're done ;).

So what I'm suggesting is simpler than what you were expecting, not
more complex. It would be more complex to replicate the restart-bounce
logic in the aio layer too.

Old drivers using bdrv_aio_read/write will keep working, new drivers
using dma api can also use bdrv_aio_readv/writev and the linearization
will happen inside the dma api if aio misses preadv/pwritev support.

> If we artificially cap at say 50MB, then you do something like:
>
> /* called with bounce_mutex held */
> while (buffer == NULL) {
>   buffer = try_to_bounce(offset, iov, iovcnt, &size);
>   if (buffer == NULL && errno == ENOMEM) {
>      /* released and re-acquired inside pthread_cond_wait() */
>      pthread_cond_wait(&bounce_cond, &bounce_mutex);
>   }
> }

What I meant is that you'll never get ENOMEM. The task will be instantly
killed during the memcpy... To hope to get any meaningful behavior from
the above you'd need to set overcommit = 1; otherwise you just need
two qemus to alloc 50M at the same time and then memcpy at the same
time to get one of the two killed with -9.

> This lets us expose a preadv/pwritev function that actually works.  The 
> expectation is that bouncing will outperform just doing pread/pwrite of 
> each vector.  Of course, you could get smart and, if try_to_bounce fails, 
> fall back to pread/pwrite of each vector.  Likewise, you can fast-path the 
> case of a single iovec to avoid bouncing entirely.

Yes, pread/pwrite can't perform if O_DIRECT is enabled. If O_DIRECT is
disabled they could perform at remotely reasonable levels depending on
the host-exception cost vs memcpy cost, but we'd rather bounce to be
sure: testing the dma api with a bounce buffer of 512 bytes (so
maximizing the number of syscalls because of the flood of restarts)
slows the I/O to a crawl even if buffering is enabled. The
syscall overhead is clearly very significant; basically memcpy is
faster for 512 bytes here.

But just let the dma api do the iovec thing. If you want to provide an
abstraction that works even if the dma api decides to send down an
iovcnt > 1, you could simply implement the fallback, but I think
it's not worth it: it should never happen that you get an iovcnt > 1
when preadv/pwritev aren't available. So you'd be writing code whose
only result could be to hide a performance bug -> not worth it.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 17:52                             ` Andrea Arcangeli
@ 2008-12-12 18:17                               ` Anthony Liguori
  -1 siblings, 0 replies; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 18:17 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Gerd Hoffmann, qemu-devel, kvm-devel

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 11:25:55AM -0600, Anthony Liguori wrote:
>   
>> Hrm, that's more complex than I was expecting.  I was thinking the bdrv aio 
>> infrastructure would always take an iovec.  Any details about the 
>> underlying host's ability to handle the iovec would be insulated.
>>     
>
> You can't remove the restart memory-capped mechanism from the dma api:
> we have to handle dma to non-ram, which potentially requires copying the
> whole buffer, so we're forced to have a safe linearization at the dma
> api layer. So it's not necessary to reinvent the same
> restart-partial-transfer logic in the aio layer too. Just set the
> define and teach the aio logic to use pread/pwrite if iovcnt == 1 and
> you're done ;).
>
> So what I'm suggesting is simpler than what you were expecting, not
> more complex. It would be more complex to replicate the restart-bounce
> logic in the aio layer too.
>
> Old drivers using bdrv_aio_read/write will keep working, new drivers
> using dma api can also use bdrv_aio_readv/writev and the linearization
> will happen inside the dma api if aio misses preadv/pwritev support.
>   

You assume that anything using bdrv_aio_readv/writev will be going 
through a DMA API.  This isn't a safe assumption.

Furthermore, you need some way to communicate the fact that you cannot 
handle iovcnt > 1 iovecs in a performant/safe way.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 18:17                               ` Anthony Liguori
  (?)
@ 2008-12-12 18:26                               ` Andrea Arcangeli
  2008-12-12 20:12                                 ` Gerd Hoffmann
  -1 siblings, 1 reply; 75+ messages in thread
From: Andrea Arcangeli @ 2008-12-12 18:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gerd Hoffmann, kvm-devel

On Fri, Dec 12, 2008 at 12:17:58PM -0600, Anthony Liguori wrote:
> You assume that anything using bdrv_aio_readv/writev will be going through 
> a DMA API.  This isn't a safe assumption.

Well it's obviously a safe assumption right now... ;)

I have a hard time seeing any 'metadata' (because _data_ will be
guaranteed to always pass through the dma api if it wants to be
vectored) being physically contiguous on disk but not contiguous in
ram.

> Furthermore, you need some way to communicate the fact that you cannot 
> handle iovcnt > 1 iovecs in a performant/safe way.

assert(iovcnt == 1) will work fine. If somebody changes some backend
like qcow2 to call bdrv_aio_readv/writev, they'll get an assert and
they'll ask for a compatibility layer. I don't see any urgency to add
it immediately, and I think assert(iovcnt == 1) is preferable until a
user emerges asking for bdrv_aio_readv/writev, as it'll function as a
debug trap to make sure everything works right.
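
Roughly (hypothetical shape, not a real patch):

#include <assert.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t paio_readv(int fd, const struct iovec *iov, int iovcnt,
                          off_t offset)
{
    /* debug trap: any backend submitting a real vector before
     * preadv/pwritev (or an emulation) exists shows up here at once */
    assert(iovcnt == 1);
    return pread(fd, iov[0].iov_base, iov[0].iov_len, offset);
}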

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 18:26                               ` Andrea Arcangeli
@ 2008-12-12 20:12                                 ` Gerd Hoffmann
  2008-12-12 20:17                                   ` Anthony Liguori
  0 siblings, 1 reply; 75+ messages in thread
From: Gerd Hoffmann @ 2008-12-12 20:12 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: qemu-devel, kvm-devel

Andrea Arcangeli wrote:
> On Fri, Dec 12, 2008 at 12:17:58PM -0600, Anthony Liguori wrote:
>> You assume that anything using bdrv_aio_readv/writev will be going through 
>> a DMA API.  This isn't a safe assumption.
> 
> Well it's obviously a safe assumption right now... ;)

Well, the xen block backend in my xen patch queue calls bdrv_* directly
right now.  Dunno whether it is possible/useful to fit the grant table
stuff into the upcoming dma api somehow.

cheers,
  Gerd


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 20:12                                 ` Gerd Hoffmann
@ 2008-12-12 20:17                                   ` Anthony Liguori
  2008-12-12 20:35                                     ` Gerd Hoffmann
  0 siblings, 1 reply; 75+ messages in thread
From: Anthony Liguori @ 2008-12-12 20:17 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Andrea Arcangeli, qemu-devel, kvm-devel

Gerd Hoffmann wrote:
> Andrea Arcangeli wrote:
>   
>> On Fri, Dec 12, 2008 at 12:17:58PM -0600, Anthony Liguori wrote:
>>     
>>> You assume that anything using bdrv_aio_readv/writev will be going through 
>>> a DMA API.  This isn't a safe assumption.
>>>       
>> Well it's obviously a safe assumption right now... ;)
>>     
>
> Well, the xen block backend in my xen patch queue calls bdrv_* directly
> right now.  Dunno whether it is possible/useful to fit the grant table
> stuff into the upcoming dma api somehow.
>   

I don't know about grant table references b/c that's really foreign 
memory.  But this is a good argument against the DMA API as it stands, b/c 
you may be handing foreign memory to bdrv_aio_readv/writev.

A big reason for the map/unmap lock/unlock abstraction though would be 
the qemu-dm map cache.  I think you could use it to pretty reasonably 
integrate the map cache, which I know I've previously said could never be 
done :-)

Regards,

Anthony Liguori

> cheers,
>   Gerd
>


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-12 20:17                                   ` Anthony Liguori
@ 2008-12-12 20:35                                     ` Gerd Hoffmann
  0 siblings, 0 replies; 75+ messages in thread
From: Gerd Hoffmann @ 2008-12-12 20:35 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Andrea Arcangeli, qemu-devel, kvm-devel

Anthony Liguori wrote:
> I don't know about grant table references b/c that's really foreign
> memory.  But this is a good argument against the DMA API as it stands, b/c
> you may be handing foreign memory to bdrv_aio_readv/writev.
> 
> A big reason for the map/unmap lock/unlock abstraction though would be
> the qemu-dm map cache.  I think you could use it to pretty reasonably
> integrate the map cache, which I know I've previously said could never be
> done :-)

I see we fully agree here.

/me mumbles something about funny mail crossings.
Maybe we should use irc next time to save some typing ;)

cheers,
  Gerd


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool
  2008-12-05 21:21 ` [Qemu-devel] " Anthony Liguori
                   ` (2 preceding siblings ...)
  (?)
@ 2008-12-17 14:44 ` Ian Jackson
  -1 siblings, 0 replies; 75+ messages in thread
From: Ian Jackson @ 2008-12-17 14:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, kvm-devel

Anthony Liguori writes:
> This patch implements posix-aio using pthreads.  It immediately
> eliminates the need for fd pooling.

Well in principle I think I approve.  Having read the discussion I
think emulation of preadv/pwritev with pread/pwrite is probably fine.

However I did want to make one comment on this: I think it would be
valuable to try to settle on a good and stable design for the aio
functionality.

I've spent the last several weeks tracking down a mysterious failure
of qemu-dm in our testing systems which turned out to be due to a bug
in the RHEL 4.3 glibc.  Obviously my own workstation and test box are
much newer and better than that.  Last time I merged from qemu
upstream I also spent quite some time discovering a different glibc bug.

These kinds of threading/aio/etc. features are often immature in older
libcs, some of which advertise them but have subtle bugs.  If we can
at least pick an approach and an implementation, rather than
constantly changing, we'll have a chance to find and work around those
bugs.

But having said that, I think avoiding too much reliance on
glibc-implemented primitives (which often seem to be buggy in some
subtle way) and doing it ourselves seems like a better approach.

Ian.

^ permalink raw reply	[flat|nested] 75+ messages in thread

Thread overview: 75+ messages
2008-12-05 21:21 [RFC] Replace posix-aio with custom thread pool Anthony Liguori
2008-12-06  9:03 ` Blue Swirl
2008-12-06 18:26   ` Jamie Lokier
2008-12-08 18:23   ` Anthony Liguori
2008-12-09 15:51 ` Gerd Hoffmann
2008-12-09 16:01   ` Anthony Liguori
2008-12-10 16:44     ` Andrea Arcangeli
2008-12-10 17:21       ` Anthony Liguori
2008-12-10 17:29         ` Gerd Hoffmann
2008-12-10 18:50           ` Anthony Liguori
2008-12-10 19:08             ` Andrea Arcangeli
2008-12-11 13:12               ` Andrea Arcangeli
2008-12-11 15:24                 ` Gerd Hoffmann
2008-12-11 15:53                   ` Andrea Arcangeli
2008-12-11 16:11                     ` Gerd Hoffmann
2008-12-11 16:49                       ` Andrea Arcangeli
2008-12-11 17:20                         ` Gerd Hoffmann
2008-12-11 18:11                           ` Andrea Arcangeli
2008-12-11 20:38                             ` Gerd Hoffmann
2008-12-11 20:40                             ` Anthony Liguori
2008-12-12  8:23                             ` Jens Axboe
2008-12-12 11:51                               ` Andrea Arcangeli
2008-12-12 11:54                                 ` Jens Axboe
2008-12-12 14:13                                   ` Andrea Arcangeli
2008-12-12 14:24                                     ` Anthony Liguori
2008-12-12 16:33                                       ` Chris Wright
2008-12-12 16:51                                         ` Anthony Liguori
2008-12-12 16:52                                           ` Chris Wright
2008-12-11 21:32                         ` Christoph Hellwig
2008-12-12  0:27                           ` Andrea Arcangeli
2008-12-11 21:30                     ` Christoph Hellwig
2008-12-11 16:41                   ` Anthony Liguori
2008-12-12 14:24               ` Andrea Arcangeli
2008-12-12 14:35                 ` Anthony Liguori
2008-12-12 15:44                   ` Andrea Arcangeli
2008-12-12 16:49                     ` Anthony Liguori
2008-12-12 17:09                       ` Andrea Arcangeli
2008-12-12 17:25                         ` Anthony Liguori
2008-12-12 17:52                           ` Andrea Arcangeli
2008-12-12 18:17                             ` Anthony Liguori
2008-12-12 18:26                               ` Andrea Arcangeli
2008-12-12 20:12                                 ` Gerd Hoffmann
2008-12-12 20:17                                   ` Anthony Liguori
2008-12-12 20:35                                     ` Gerd Hoffmann
2008-12-09 17:16   ` Avi Kivity
2008-12-17 14:44 ` Ian Jackson
