From mboxrd@z Thu Jan 1 00:00:00 1970 From: Anthony Liguori Subject: [RFC] Replace posix-aio with custom thread pool Date: Fri, 5 Dec 2008 15:21:01 -0600 Message-ID: <1228512061-25398-1-git-send-email-aliguori@us.ibm.com> Cc: kvm-devel , Anthony Liguori To: qemu-devel@nongnu.org Return-path: Received: from e38.co.us.ibm.com ([32.97.110.159]:41777 "EHLO e38.co.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752372AbYLEVVM (ORCPT ); Fri, 5 Dec 2008 16:21:12 -0500 Received: from d03relay02.boulder.ibm.com (d03relay02.boulder.ibm.com [9.17.195.227]) by e38.co.us.ibm.com (8.13.1/8.13.1) with ESMTP id mB5LK5S8025512 for ; Fri, 5 Dec 2008 14:20:06 -0700 Received: from d03av02.boulder.ibm.com (d03av02.boulder.ibm.com [9.17.195.168]) by d03relay02.boulder.ibm.com (8.13.8/8.13.8/NCO v9.1) with ESMTP id mB5LLBaK208238 for ; Fri, 5 Dec 2008 14:21:11 -0700 Received: from d03av02.boulder.ibm.com (loopback [127.0.0.1]) by d03av02.boulder.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id mB5LLAS1005684 for ; Fri, 5 Dec 2008 14:21:11 -0700 Sender: kvm-owner@vger.kernel.org List-ID: glibc implements posix-aio as a thread pool and imposes a number of limitations. 1) it limits one request per-file descriptor. we hack around this by dup()'ing file descriptors which is hideously ugly 2) it's impossible to add new interfaces and we need a vectored read/write operation to properly support a zero-copy API. What has been suggested to me by glibc folks, is to implement whatever new interfaces we want and then it can eventually be proposed for standardization. This requires that we implement our own posix-aio implementation though. This patch implements posix-aio using pthreads. It immediately eliminates the need for fd pooling. It performs at least as well as the current posix-aio code (in some circumstances, even better). My only concern here is non-Linux Unices like FreeBSD. They have kernel support for posix-aio. Since we cannot extend those interfaces though, I think that even on those platforms we should still use a thread pool. Signed-off-by: Anthony Liguori diff --git a/Makefile b/Makefile index 76470a4..030a6ae 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,9 @@ BLOCK_OBJS+=nbd.o block.o aio.o ifdef CONFIG_WIN32 BLOCK_OBJS += block-raw-win32.o else +ifdef CONFIG_AIO +BLOCK_OBJS += posix-aio-compat.o +endif BLOCK_OBJS += block-raw-posix.o endif diff --git a/Makefile.target b/Makefile.target index 671d72a..32dfb85 100644 --- a/Makefile.target +++ b/Makefile.target @@ -595,6 +595,9 @@ endif ifdef CONFIG_WIN32 OBJS+=block-raw-win32.o else +ifdef CONFIG_AIO +OBJS+=posix-aio-compat.o +endif OBJS+=block-raw-posix.o endif diff --git a/block-raw-posix.c b/block-raw-posix.c index 0a06a12..74b875a 100644 --- a/block-raw-posix.c +++ b/block-raw-posix.c @@ -27,7 +27,7 @@ #include "block_int.h" #include #ifdef CONFIG_AIO -#include +#include "posix-aio-compat.h" #endif #ifdef CONFIG_COCOA @@ -93,16 +93,10 @@ reopen it to see if the disk has been changed */ #define FD_OPEN_TIMEOUT 1000 -/* posix-aio doesn't allow multiple outstanding requests to a single file - * descriptor. we implement a pool of dup()'d file descriptors to work - * around this */ -#define RAW_FD_POOL_SIZE 64 - typedef struct BDRVRawState { int fd; int type; unsigned int lseek_err_cnt; - int fd_pool[RAW_FD_POOL_SIZE]; #if defined(__linux__) /* linux floppy specific */ int fd_open_flags; @@ -122,7 +116,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; int fd, open_flags, ret; - int i; posix_aio_init(); @@ -155,8 +148,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) return ret; } s->fd = fd; - for (i = 0; i < RAW_FD_POOL_SIZE; i++) - s->fd_pool[i] = -1; s->aligned_buf = NULL; if ((flags & BDRV_O_NOCACHE)) { s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); @@ -446,7 +437,6 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset, typedef struct RawAIOCB { BlockDriverAIOCB common; - int fd; struct aiocb aiocb; struct RawAIOCB *next; int ret; @@ -458,38 +448,6 @@ typedef struct PosixAioState RawAIOCB *first_aio; } PosixAioState; -static int raw_fd_pool_get(BDRVRawState *s) -{ - int i; - - for (i = 0; i < RAW_FD_POOL_SIZE; i++) { - /* already in use */ - if (s->fd_pool[i] != -1) - continue; - - /* try to dup file descriptor */ - s->fd_pool[i] = dup(s->fd); - if (s->fd_pool[i] != -1) - return s->fd_pool[i]; - } - - /* we couldn't dup the file descriptor so just use the main one */ - return s->fd; -} - -static void raw_fd_pool_put(RawAIOCB *acb) -{ - BDRVRawState *s = acb->common.bs->opaque; - int i; - - for (i = 0; i < RAW_FD_POOL_SIZE; i++) { - if (s->fd_pool[i] == acb->fd) { - close(s->fd_pool[i]); - s->fd_pool[i] = -1; - } - } -} - static void posix_aio_read(void *opaque) { PosixAioState *s = opaque; @@ -519,7 +477,6 @@ static void posix_aio_read(void *opaque) if (ret == ECANCELED) { /* remove the request */ *pacb = acb->next; - raw_fd_pool_put(acb); qemu_aio_release(acb); } else if (ret != EINPROGRESS) { /* end of aio */ @@ -536,7 +493,6 @@ static void posix_aio_read(void *opaque) *pacb = acb->next; /* call the callback */ acb->common.cb(acb->common.opaque, ret); - raw_fd_pool_put(acb); qemu_aio_release(acb); break; } else { @@ -571,6 +527,7 @@ static int posix_aio_init(void) struct sigaction act; PosixAioState *s; int fds[2]; + struct aioinit ai; if (posix_aio_state) return 0; @@ -598,24 +555,11 @@ static int posix_aio_init(void) qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s); -#if defined(__linux__) - { - struct aioinit ai; + memset(&ai, 0, sizeof(ai)); + ai.aio_threads = 64; + ai.aio_num = 64; + aio_init(&ai); - memset(&ai, 0, sizeof(ai)); -#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 4) - ai.aio_threads = 64; - ai.aio_num = 64; -#else - /* XXX: aio thread exit seems to hang on RedHat 9 and this init - seems to fix the problem. */ - ai.aio_threads = 1; - ai.aio_num = 1; - ai.aio_idle_time = 365 * 100000; -#endif - aio_init(&ai); - } -#endif posix_aio_state = s; return 0; @@ -634,8 +578,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, acb = qemu_aio_get(bs, cb, opaque); if (!acb) return NULL; - acb->fd = raw_fd_pool_get(s); - acb->aiocb.aio_fildes = acb->fd; + acb->aiocb.aio_fildes = s->fd; acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; acb->aiocb.aio_buf = buf; @@ -738,14 +681,12 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) break; } else if (*pacb == acb) { *pacb = acb->next; - raw_fd_pool_put(acb); qemu_aio_release(acb); break; } pacb = &acb->next; } } - #else /* CONFIG_AIO */ static int posix_aio_init(void) { @@ -753,17 +694,6 @@ static int posix_aio_init(void) } #endif /* CONFIG_AIO */ -static void raw_close_fd_pool(BDRVRawState *s) -{ - int i; - - for (i = 0; i < RAW_FD_POOL_SIZE; i++) { - if (s->fd_pool[i] != -1) { - close(s->fd_pool[i]); - s->fd_pool[i] = -1; - } - } -} static void raw_close(BlockDriverState *bs) { @@ -774,7 +704,6 @@ static void raw_close(BlockDriverState *bs) if (s->aligned_buf != NULL) qemu_free(s->aligned_buf); } - raw_close_fd_pool(s); } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -895,6 +824,7 @@ BlockDriver bdrv_raw = { .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif + .bdrv_pread = raw_pread, .bdrv_pwrite = raw_pwrite, .bdrv_truncate = raw_truncate, @@ -965,7 +895,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma static int hdev_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; - int fd, open_flags, ret, i; + int fd, open_flags, ret; posix_aio_init(); @@ -1032,8 +962,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) return ret; } s->fd = fd; - for (i = 0; i < RAW_FD_POOL_SIZE; i++) - s->fd_pool[i] = -1; #if defined(__linux__) /* close fd so that we can reopen it as needed */ if (s->type == FTYPE_FD) { @@ -1061,7 +989,6 @@ static int fd_open(BlockDriverState *bs) (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { close(s->fd); s->fd = -1; - raw_close_fd_pool(s); #ifdef DEBUG_FLOPPY printf("Floppy closed\n"); #endif @@ -1162,7 +1089,6 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) if (s->fd >= 0) { close(s->fd); s->fd = -1; - raw_close_fd_pool(s); } fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); if (fd >= 0) { @@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = { .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif + .bdrv_pread = raw_pread, .bdrv_pwrite = raw_pwrite, .bdrv_getlength = raw_getlength, diff --git a/configure b/configure index 7f82786..146b3fc 100755 --- a/configure +++ b/configure @@ -152,7 +152,6 @@ FreeBSD) bsd="yes" audio_drv_list="oss" audio_possible_drivers="oss sdl esd pa" -aio_lib="-lpthread" if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then kqemu="yes" fi @@ -162,7 +161,6 @@ bsd="yes" audio_drv_list="oss" audio_possible_drivers="oss sdl esd" oss_lib="-lossaudio" -aio_lib="-lrt -lpthread" ;; OpenBSD) bsd="yes" @@ -170,7 +168,6 @@ openbsd="yes" audio_drv_list="oss" audio_possible_drivers="oss sdl esd" oss_lib="-lossaudio" -aio_lib="-lpthread" ;; Darwin) bsd="yes" @@ -181,7 +178,6 @@ audio_drv_list="coreaudio" audio_possible_drivers="coreaudio sdl fmod" OS_CFLAGS="-mdynamic-no-pic" OS_LDFLAGS="-framework CoreFoundation -framework IOKit" -aio_lib="-lpthread" ;; SunOS) solaris="yes" @@ -533,15 +529,6 @@ if test "$mingw32" = "yes" ; then bsd_user="no" fi -if [ "$darwin" = "yes" -o "$mingw32" = "yes" ] ; then - AIOLIBS= -elif [ "$bsd" = "yes" ]; then - AIOLIBS="$aio_lib" -else - # Some Linux architectures (e.g. s390) don't imply -lpthread automatically. - AIOLIBS="-lrt -lpthread" -fi - # Check for gcc4, error if pre-gcc4 if test "$check_gcc" = "yes" ; then cat > $TMPC < $TMPC << EOF -#include -int main(void) { return aio_write(NULL); } +#include +int main(void) { pthread_mutex_t lock; return 0; } EOF if $cc $ARCH_CFLAGS -o $TMPE $AIOLIBS $TMPC 2> /dev/null ; then aio=yes + AIOLIBS="-lpthread" fi fi diff --git a/posix-aio-compat.c b/posix-aio-compat.c new file mode 100644 index 0000000..c21d579 --- /dev/null +++ b/posix-aio-compat.c @@ -0,0 +1,202 @@ +/* + * QEMU posix-aio emulation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include +#include "osdep.h" + +#include "posix-aio-compat.h" + +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +static pthread_t thread_id; +static int max_threads = 64; +static int cur_threads = 0; +static int idle_threads = 0; +static TAILQ_HEAD(, aiocb) request_list; + +static void *aio_thread(void *unused) +{ + sigset_t set; + + /* block all signals */ + sigfillset(&set); + sigprocmask(SIG_BLOCK, &set, NULL); + + while (1) { + struct aiocb *aiocb; + size_t offset; + int ret = 0; + + pthread_mutex_lock(&lock); + + while (TAILQ_EMPTY(&request_list) && + !(ret == ETIMEDOUT)) { + struct timespec ts = { 0 }; + qemu_timeval tv; + + qemu_gettimeofday(&tv); + ts.tv_sec = tv.tv_sec + 10; + ret = pthread_cond_timedwait(&cond, &lock, &ts); + } + + if (ret == ETIMEDOUT) + break; + + aiocb = TAILQ_FIRST(&request_list); + TAILQ_REMOVE(&request_list, aiocb, node); + + offset = 0; + aiocb->active = 1; + + idle_threads--; + pthread_mutex_unlock(&lock); + + while (offset < aiocb->aio_nbytes) { + ssize_t len; + + if (aiocb->is_write) + len = pwrite(aiocb->aio_fildes, + (const char *)aiocb->aio_buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + else + len = pread(aiocb->aio_fildes, + (char *)aiocb->aio_buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + + if (len == -1 && errno == EINTR) + continue; + else if (len == -1) { + pthread_mutex_lock(&lock); + aiocb->ret = -errno; + pthread_mutex_unlock(&lock); + break; + } else if (len == 0) + break; + + offset += len; + + pthread_mutex_lock(&lock); + aiocb->ret = offset; + pthread_mutex_unlock(&lock); + } + + pthread_mutex_lock(&lock); + idle_threads++; + pthread_mutex_unlock(&lock); + + sigqueue(getpid(), + aiocb->aio_sigevent.sigev_signo, + aiocb->aio_sigevent.sigev_value); + } + + idle_threads--; + cur_threads--; + pthread_mutex_unlock(&lock); + + return NULL; +} + +static int spawn_thread(void) +{ + pthread_attr_t attr; + int ret; + + cur_threads++; + idle_threads++; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + ret = pthread_create(&thread_id, &attr, aio_thread, NULL); + pthread_attr_destroy(&attr); + + return ret; +} + +int _compat_aio_init(struct aioinit *aioinit) +{ + TAILQ_INIT(&request_list); + + return 0; +} + +static int _compat_aio_submit(struct aiocb *aiocb, int is_write) +{ + aiocb->is_write = is_write; + aiocb->ret = -EINPROGRESS; + aiocb->active = 0; + pthread_mutex_lock(&lock); + if (idle_threads == 0 && cur_threads < max_threads) + spawn_thread(); + TAILQ_INSERT_TAIL(&request_list, aiocb, node); + pthread_mutex_unlock(&lock); + pthread_cond_broadcast(&cond); + + return 0; +} + +int _compat_aio_read(struct aiocb *aiocb) +{ + return _compat_aio_submit(aiocb, 0); +} + +int _compat_aio_write(struct aiocb *aiocb) +{ + return _compat_aio_submit(aiocb, 1); +} + +ssize_t _compat_aio_return(struct aiocb *aiocb) +{ + ssize_t ret; + + pthread_mutex_lock(&lock); + ret = aiocb->ret; + pthread_mutex_unlock(&lock); + + return ret; +} + +int _compat_aio_error(struct aiocb *aiocb) +{ + ssize_t ret = _compat_aio_return(aiocb); + + if (ret < 0) + ret = -ret; + else + ret = 0; + + return ret; +} + +int _compat_aio_cancel(int fd, struct aiocb *aiocb) +{ + int ret; + + pthread_mutex_lock(&lock); + if (!aiocb->active) { + TAILQ_REMOVE(&request_list, aiocb, node); + aiocb->ret = -ECANCELED; + ret = AIO_CANCELED; + } else if (aiocb->ret == -EINPROGRESS) + ret = AIO_NOTCANCELED; + else + ret = AIO_ALLDONE; + pthread_mutex_unlock(&lock); + + return ret; +} + diff --git a/posix-aio-compat.h b/posix-aio-compat.h new file mode 100644 index 0000000..c8fcb0e --- /dev/null +++ b/posix-aio-compat.h @@ -0,0 +1,86 @@ +/* + * QEMU posix-aio emulation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_POSIX_AIO_COMPAT_H +#define QEMU_POSIX_AIO_COMPAT_H + +#include +#include +#include + +#include "sys-queue.h" + +#define AIO_CANCELED 0x01 +#define AIO_NOTCANCELED 0x02 +#define AIO_ALLDONE 0x03 + +struct aiocb +{ + int aio_fildes; + void *aio_buf; + size_t aio_nbytes; + struct sigevent aio_sigevent; + off_t aio_offset; + + /* private */ + TAILQ_ENTRY(aiocb) node; + int is_write; + ssize_t ret; + int active; +}; + +struct aioinit +{ + int aio_threads; + int aio_num; + int aio_idle_time; +}; + +int _compat_aio_init(struct aioinit *aioinit); +int _compat_aio_read(struct aiocb *aiocb); +int _compat_aio_write(struct aiocb *aiocb); +int _compat_aio_error(struct aiocb *aiocb); +ssize_t _compat_aio_return(struct aiocb *aiocb); +int _compat_aio_cancel(int fd, struct aiocb *aiocb); + +static inline int aio_init(struct aioinit *aioinit) +{ + return _compat_aio_init(aioinit); +} + +static inline int aio_read(struct aiocb *aiocb) +{ + return _compat_aio_read(aiocb); +} + +static inline int aio_write(struct aiocb *aiocb) +{ + return _compat_aio_write(aiocb); +} + +static inline int aio_error(struct aiocb *aiocb) +{ + return _compat_aio_error(aiocb); +} + +static inline ssize_t aio_return(struct aiocb *aiocb) +{ + return _compat_aio_return(aiocb); +} + +static inline int aio_cancel(int fd, struct aiocb *aiocb) +{ + return _compat_aio_cancel(fd, aiocb); +} + +#endif From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43) id 1L8i6v-00078j-Jz for qemu-devel@nongnu.org; Fri, 05 Dec 2008 16:21:17 -0500 Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43) id 1L8i6t-00075M-68 for qemu-devel@nongnu.org; Fri, 05 Dec 2008 16:21:17 -0500 Received: from [199.232.76.173] (port=44352 helo=monty-python.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1L8i6s-00075C-Ii for qemu-devel@nongnu.org; Fri, 05 Dec 2008 16:21:14 -0500 Received: from e36.co.us.ibm.com ([32.97.110.154]:40639) by monty-python.gnu.org with esmtps (TLS-1.0:DHE_RSA_AES_256_CBC_SHA1:32) (Exim 4.60) (envelope-from ) id 1L8i6r-0007p0-Un for qemu-devel@nongnu.org; Fri, 05 Dec 2008 16:21:14 -0500 Received: from d03relay04.boulder.ibm.com (d03relay04.boulder.ibm.com [9.17.195.106]) by e36.co.us.ibm.com (8.13.1/8.13.1) with ESMTP id mB5LKSHR021053 for ; Fri, 5 Dec 2008 14:20:28 -0700 Received: from d03av02.boulder.ibm.com (d03av02.boulder.ibm.com [9.17.195.168]) by d03relay04.boulder.ibm.com (8.13.8/8.13.8/NCO v9.1) with ESMTP id mB5LLAE9226486 for ; Fri, 5 Dec 2008 14:21:10 -0700 Received: from d03av02.boulder.ibm.com (loopback [127.0.0.1]) by d03av02.boulder.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id mB5LLARv005684 for ; Fri, 5 Dec 2008 14:21:10 -0700 From: Anthony Liguori Date: Fri, 5 Dec 2008 15:21:01 -0600 Message-Id: <1228512061-25398-1-git-send-email-aliguori@us.ibm.com> Subject: [Qemu-devel] [RFC] Replace posix-aio with custom thread pool Reply-To: qemu-devel@nongnu.org List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: Anthony Liguori , kvm-devel glibc implements posix-aio as a thread pool and imposes a number of limitations. 1) it limits one request per-file descriptor. we hack around this by dup()'ing file descriptors which is hideously ugly 2) it's impossible to add new interfaces and we need a vectored read/write operation to properly support a zero-copy API. What has been suggested to me by glibc folks, is to implement whatever new interfaces we want and then it can eventually be proposed for standardization. This requires that we implement our own posix-aio implementation though. This patch implements posix-aio using pthreads. It immediately eliminates the need for fd pooling. It performs at least as well as the current posix-aio code (in some circumstances, even better). My only concern here is non-Linux Unices like FreeBSD. They have kernel support for posix-aio. Since we cannot extend those interfaces though, I think that even on those platforms we should still use a thread pool. Signed-off-by: Anthony Liguori diff --git a/Makefile b/Makefile index 76470a4..030a6ae 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,9 @@ BLOCK_OBJS+=nbd.o block.o aio.o ifdef CONFIG_WIN32 BLOCK_OBJS += block-raw-win32.o else +ifdef CONFIG_AIO +BLOCK_OBJS += posix-aio-compat.o +endif BLOCK_OBJS += block-raw-posix.o endif diff --git a/Makefile.target b/Makefile.target index 671d72a..32dfb85 100644 --- a/Makefile.target +++ b/Makefile.target @@ -595,6 +595,9 @@ endif ifdef CONFIG_WIN32 OBJS+=block-raw-win32.o else +ifdef CONFIG_AIO +OBJS+=posix-aio-compat.o +endif OBJS+=block-raw-posix.o endif diff --git a/block-raw-posix.c b/block-raw-posix.c index 0a06a12..74b875a 100644 --- a/block-raw-posix.c +++ b/block-raw-posix.c @@ -27,7 +27,7 @@ #include "block_int.h" #include #ifdef CONFIG_AIO -#include +#include "posix-aio-compat.h" #endif #ifdef CONFIG_COCOA @@ -93,16 +93,10 @@ reopen it to see if the disk has been changed */ #define FD_OPEN_TIMEOUT 1000 -/* posix-aio doesn't allow multiple outstanding requests to a single file - * descriptor. we implement a pool of dup()'d file descriptors to work - * around this */ -#define RAW_FD_POOL_SIZE 64 - typedef struct BDRVRawState { int fd; int type; unsigned int lseek_err_cnt; - int fd_pool[RAW_FD_POOL_SIZE]; #if defined(__linux__) /* linux floppy specific */ int fd_open_flags; @@ -122,7 +116,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; int fd, open_flags, ret; - int i; posix_aio_init(); @@ -155,8 +148,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) return ret; } s->fd = fd; - for (i = 0; i < RAW_FD_POOL_SIZE; i++) - s->fd_pool[i] = -1; s->aligned_buf = NULL; if ((flags & BDRV_O_NOCACHE)) { s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); @@ -446,7 +437,6 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset, typedef struct RawAIOCB { BlockDriverAIOCB common; - int fd; struct aiocb aiocb; struct RawAIOCB *next; int ret; @@ -458,38 +448,6 @@ typedef struct PosixAioState RawAIOCB *first_aio; } PosixAioState; -static int raw_fd_pool_get(BDRVRawState *s) -{ - int i; - - for (i = 0; i < RAW_FD_POOL_SIZE; i++) { - /* already in use */ - if (s->fd_pool[i] != -1) - continue; - - /* try to dup file descriptor */ - s->fd_pool[i] = dup(s->fd); - if (s->fd_pool[i] != -1) - return s->fd_pool[i]; - } - - /* we couldn't dup the file descriptor so just use the main one */ - return s->fd; -} - -static void raw_fd_pool_put(RawAIOCB *acb) -{ - BDRVRawState *s = acb->common.bs->opaque; - int i; - - for (i = 0; i < RAW_FD_POOL_SIZE; i++) { - if (s->fd_pool[i] == acb->fd) { - close(s->fd_pool[i]); - s->fd_pool[i] = -1; - } - } -} - static void posix_aio_read(void *opaque) { PosixAioState *s = opaque; @@ -519,7 +477,6 @@ static void posix_aio_read(void *opaque) if (ret == ECANCELED) { /* remove the request */ *pacb = acb->next; - raw_fd_pool_put(acb); qemu_aio_release(acb); } else if (ret != EINPROGRESS) { /* end of aio */ @@ -536,7 +493,6 @@ static void posix_aio_read(void *opaque) *pacb = acb->next; /* call the callback */ acb->common.cb(acb->common.opaque, ret); - raw_fd_pool_put(acb); qemu_aio_release(acb); break; } else { @@ -571,6 +527,7 @@ static int posix_aio_init(void) struct sigaction act; PosixAioState *s; int fds[2]; + struct aioinit ai; if (posix_aio_state) return 0; @@ -598,24 +555,11 @@ static int posix_aio_init(void) qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s); -#if defined(__linux__) - { - struct aioinit ai; + memset(&ai, 0, sizeof(ai)); + ai.aio_threads = 64; + ai.aio_num = 64; + aio_init(&ai); - memset(&ai, 0, sizeof(ai)); -#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 4) - ai.aio_threads = 64; - ai.aio_num = 64; -#else - /* XXX: aio thread exit seems to hang on RedHat 9 and this init - seems to fix the problem. */ - ai.aio_threads = 1; - ai.aio_num = 1; - ai.aio_idle_time = 365 * 100000; -#endif - aio_init(&ai); - } -#endif posix_aio_state = s; return 0; @@ -634,8 +578,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, acb = qemu_aio_get(bs, cb, opaque); if (!acb) return NULL; - acb->fd = raw_fd_pool_get(s); - acb->aiocb.aio_fildes = acb->fd; + acb->aiocb.aio_fildes = s->fd; acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; acb->aiocb.aio_buf = buf; @@ -738,14 +681,12 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) break; } else if (*pacb == acb) { *pacb = acb->next; - raw_fd_pool_put(acb); qemu_aio_release(acb); break; } pacb = &acb->next; } } - #else /* CONFIG_AIO */ static int posix_aio_init(void) { @@ -753,17 +694,6 @@ static int posix_aio_init(void) } #endif /* CONFIG_AIO */ -static void raw_close_fd_pool(BDRVRawState *s) -{ - int i; - - for (i = 0; i < RAW_FD_POOL_SIZE; i++) { - if (s->fd_pool[i] != -1) { - close(s->fd_pool[i]); - s->fd_pool[i] = -1; - } - } -} static void raw_close(BlockDriverState *bs) { @@ -774,7 +704,6 @@ static void raw_close(BlockDriverState *bs) if (s->aligned_buf != NULL) qemu_free(s->aligned_buf); } - raw_close_fd_pool(s); } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -895,6 +824,7 @@ BlockDriver bdrv_raw = { .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif + .bdrv_pread = raw_pread, .bdrv_pwrite = raw_pwrite, .bdrv_truncate = raw_truncate, @@ -965,7 +895,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma static int hdev_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; - int fd, open_flags, ret, i; + int fd, open_flags, ret; posix_aio_init(); @@ -1032,8 +962,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) return ret; } s->fd = fd; - for (i = 0; i < RAW_FD_POOL_SIZE; i++) - s->fd_pool[i] = -1; #if defined(__linux__) /* close fd so that we can reopen it as needed */ if (s->type == FTYPE_FD) { @@ -1061,7 +989,6 @@ static int fd_open(BlockDriverState *bs) (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { close(s->fd); s->fd = -1; - raw_close_fd_pool(s); #ifdef DEBUG_FLOPPY printf("Floppy closed\n"); #endif @@ -1162,7 +1089,6 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) if (s->fd >= 0) { close(s->fd); s->fd = -1; - raw_close_fd_pool(s); } fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); if (fd >= 0) { @@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = { .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif + .bdrv_pread = raw_pread, .bdrv_pwrite = raw_pwrite, .bdrv_getlength = raw_getlength, diff --git a/configure b/configure index 7f82786..146b3fc 100755 --- a/configure +++ b/configure @@ -152,7 +152,6 @@ FreeBSD) bsd="yes" audio_drv_list="oss" audio_possible_drivers="oss sdl esd pa" -aio_lib="-lpthread" if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then kqemu="yes" fi @@ -162,7 +161,6 @@ bsd="yes" audio_drv_list="oss" audio_possible_drivers="oss sdl esd" oss_lib="-lossaudio" -aio_lib="-lrt -lpthread" ;; OpenBSD) bsd="yes" @@ -170,7 +168,6 @@ openbsd="yes" audio_drv_list="oss" audio_possible_drivers="oss sdl esd" oss_lib="-lossaudio" -aio_lib="-lpthread" ;; Darwin) bsd="yes" @@ -181,7 +178,6 @@ audio_drv_list="coreaudio" audio_possible_drivers="coreaudio sdl fmod" OS_CFLAGS="-mdynamic-no-pic" OS_LDFLAGS="-framework CoreFoundation -framework IOKit" -aio_lib="-lpthread" ;; SunOS) solaris="yes" @@ -533,15 +529,6 @@ if test "$mingw32" = "yes" ; then bsd_user="no" fi -if [ "$darwin" = "yes" -o "$mingw32" = "yes" ] ; then - AIOLIBS= -elif [ "$bsd" = "yes" ]; then - AIOLIBS="$aio_lib" -else - # Some Linux architectures (e.g. s390) don't imply -lpthread automatically. - AIOLIBS="-lrt -lpthread" -fi - # Check for gcc4, error if pre-gcc4 if test "$check_gcc" = "yes" ; then cat > $TMPC < $TMPC << EOF -#include -int main(void) { return aio_write(NULL); } +#include +int main(void) { pthread_mutex_t lock; return 0; } EOF if $cc $ARCH_CFLAGS -o $TMPE $AIOLIBS $TMPC 2> /dev/null ; then aio=yes + AIOLIBS="-lpthread" fi fi diff --git a/posix-aio-compat.c b/posix-aio-compat.c new file mode 100644 index 0000000..c21d579 --- /dev/null +++ b/posix-aio-compat.c @@ -0,0 +1,202 @@ +/* + * QEMU posix-aio emulation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include +#include "osdep.h" + +#include "posix-aio-compat.h" + +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +static pthread_t thread_id; +static int max_threads = 64; +static int cur_threads = 0; +static int idle_threads = 0; +static TAILQ_HEAD(, aiocb) request_list; + +static void *aio_thread(void *unused) +{ + sigset_t set; + + /* block all signals */ + sigfillset(&set); + sigprocmask(SIG_BLOCK, &set, NULL); + + while (1) { + struct aiocb *aiocb; + size_t offset; + int ret = 0; + + pthread_mutex_lock(&lock); + + while (TAILQ_EMPTY(&request_list) && + !(ret == ETIMEDOUT)) { + struct timespec ts = { 0 }; + qemu_timeval tv; + + qemu_gettimeofday(&tv); + ts.tv_sec = tv.tv_sec + 10; + ret = pthread_cond_timedwait(&cond, &lock, &ts); + } + + if (ret == ETIMEDOUT) + break; + + aiocb = TAILQ_FIRST(&request_list); + TAILQ_REMOVE(&request_list, aiocb, node); + + offset = 0; + aiocb->active = 1; + + idle_threads--; + pthread_mutex_unlock(&lock); + + while (offset < aiocb->aio_nbytes) { + ssize_t len; + + if (aiocb->is_write) + len = pwrite(aiocb->aio_fildes, + (const char *)aiocb->aio_buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + else + len = pread(aiocb->aio_fildes, + (char *)aiocb->aio_buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + + if (len == -1 && errno == EINTR) + continue; + else if (len == -1) { + pthread_mutex_lock(&lock); + aiocb->ret = -errno; + pthread_mutex_unlock(&lock); + break; + } else if (len == 0) + break; + + offset += len; + + pthread_mutex_lock(&lock); + aiocb->ret = offset; + pthread_mutex_unlock(&lock); + } + + pthread_mutex_lock(&lock); + idle_threads++; + pthread_mutex_unlock(&lock); + + sigqueue(getpid(), + aiocb->aio_sigevent.sigev_signo, + aiocb->aio_sigevent.sigev_value); + } + + idle_threads--; + cur_threads--; + pthread_mutex_unlock(&lock); + + return NULL; +} + +static int spawn_thread(void) +{ + pthread_attr_t attr; + int ret; + + cur_threads++; + idle_threads++; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + ret = pthread_create(&thread_id, &attr, aio_thread, NULL); + pthread_attr_destroy(&attr); + + return ret; +} + +int _compat_aio_init(struct aioinit *aioinit) +{ + TAILQ_INIT(&request_list); + + return 0; +} + +static int _compat_aio_submit(struct aiocb *aiocb, int is_write) +{ + aiocb->is_write = is_write; + aiocb->ret = -EINPROGRESS; + aiocb->active = 0; + pthread_mutex_lock(&lock); + if (idle_threads == 0 && cur_threads < max_threads) + spawn_thread(); + TAILQ_INSERT_TAIL(&request_list, aiocb, node); + pthread_mutex_unlock(&lock); + pthread_cond_broadcast(&cond); + + return 0; +} + +int _compat_aio_read(struct aiocb *aiocb) +{ + return _compat_aio_submit(aiocb, 0); +} + +int _compat_aio_write(struct aiocb *aiocb) +{ + return _compat_aio_submit(aiocb, 1); +} + +ssize_t _compat_aio_return(struct aiocb *aiocb) +{ + ssize_t ret; + + pthread_mutex_lock(&lock); + ret = aiocb->ret; + pthread_mutex_unlock(&lock); + + return ret; +} + +int _compat_aio_error(struct aiocb *aiocb) +{ + ssize_t ret = _compat_aio_return(aiocb); + + if (ret < 0) + ret = -ret; + else + ret = 0; + + return ret; +} + +int _compat_aio_cancel(int fd, struct aiocb *aiocb) +{ + int ret; + + pthread_mutex_lock(&lock); + if (!aiocb->active) { + TAILQ_REMOVE(&request_list, aiocb, node); + aiocb->ret = -ECANCELED; + ret = AIO_CANCELED; + } else if (aiocb->ret == -EINPROGRESS) + ret = AIO_NOTCANCELED; + else + ret = AIO_ALLDONE; + pthread_mutex_unlock(&lock); + + return ret; +} + diff --git a/posix-aio-compat.h b/posix-aio-compat.h new file mode 100644 index 0000000..c8fcb0e --- /dev/null +++ b/posix-aio-compat.h @@ -0,0 +1,86 @@ +/* + * QEMU posix-aio emulation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_POSIX_AIO_COMPAT_H +#define QEMU_POSIX_AIO_COMPAT_H + +#include +#include +#include + +#include "sys-queue.h" + +#define AIO_CANCELED 0x01 +#define AIO_NOTCANCELED 0x02 +#define AIO_ALLDONE 0x03 + +struct aiocb +{ + int aio_fildes; + void *aio_buf; + size_t aio_nbytes; + struct sigevent aio_sigevent; + off_t aio_offset; + + /* private */ + TAILQ_ENTRY(aiocb) node; + int is_write; + ssize_t ret; + int active; +}; + +struct aioinit +{ + int aio_threads; + int aio_num; + int aio_idle_time; +}; + +int _compat_aio_init(struct aioinit *aioinit); +int _compat_aio_read(struct aiocb *aiocb); +int _compat_aio_write(struct aiocb *aiocb); +int _compat_aio_error(struct aiocb *aiocb); +ssize_t _compat_aio_return(struct aiocb *aiocb); +int _compat_aio_cancel(int fd, struct aiocb *aiocb); + +static inline int aio_init(struct aioinit *aioinit) +{ + return _compat_aio_init(aioinit); +} + +static inline int aio_read(struct aiocb *aiocb) +{ + return _compat_aio_read(aiocb); +} + +static inline int aio_write(struct aiocb *aiocb) +{ + return _compat_aio_write(aiocb); +} + +static inline int aio_error(struct aiocb *aiocb) +{ + return _compat_aio_error(aiocb); +} + +static inline ssize_t aio_return(struct aiocb *aiocb) +{ + return _compat_aio_return(aiocb); +} + +static inline int aio_cancel(int fd, struct aiocb *aiocb) +{ + return _compat_aio_cancel(fd, aiocb); +} + +#endif