* [PATCH] qemu-kvm/rbd: small fixes and cosmetics
@ 2010-04-14 20:49 Christian Brunner
2010-04-14 21:12 ` Yehuda Sadeh Weinraub
2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
0 siblings, 2 replies; 5+ messages in thread
From: Christian Brunner @ 2010-04-14 20:49 UTC (permalink / raw)
To: ceph-devel
Attached is a patch with small fixes and cosmetics for the qemu-kvm/rbd
driver:
- conditional linking of librados
- reduced header size
- followed the variable names of the kernel driver
Could you put it in the git repository, please?
Christian
---
Makefile | 2 ++
block/rbd.c | 56 ++++++++++++++++++++++++++++----------------------------
2 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/Makefile b/Makefile
index 6280193..b96743d 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,9 @@ configure: ;
$(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
LIBS+=-lz $(LIBS_TOOLS)
+ifdef CONFIG_RBD
LIBS+=-lrados
+endif
ifdef BUILD_DOCS
DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
diff --git a/block/rbd.c b/block/rbd.c
index 3513e48..9127bfa 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -46,14 +46,14 @@
#define OBJ_DEFAULT_ORDER 22 // 22 Bit = 4MB (as default)
#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_ORDER)
+#define RBD_MAX_OBJ_NAME_SIZE 96
+#define RBD_MAX_SEG_NAME_SIZE 128
-#define MAX_NAME 128 // Maximum size of the poolname plus objectname
-#define MAX_SNAPS 4096 // Maximum number of snapshots (unsupported at
- // the moment)
+#define RBD_SUFFIX ".rbd"
typedef struct RBDRVRBDState {
rados_pool_t pool;
- char name[MAX_NAME];
+ char name[RBD_MAX_OBJ_NAME_SIZE];
int name_len;
uint64_t size;
uint64_t objsize;
@@ -70,13 +70,13 @@ typedef struct {
char text[64];
char signature[4];
char version[8];
- uint64_t imagesize;
- uint8_t objorder;
+ uint64_t image_size;
+ uint8_t obj_order;
uint8_t crypt_type;
uint8_t comp_type; // unsupported at the moment
uint64_t snap_seq; // unsupported at the moment
uint16_t snap_count; // unsupported at the moment
- uint64_t snap_id[MAX_SNAPS]; // unsupported at the moment
+ uint64_t snap_id[0]; // unsupported at the moment
} __attribute__((packed)) RbdHeader1;
static int rbd_parsename(const char *filename, char *pool, char *name) {
@@ -88,7 +88,7 @@ static int rbd_parsename(const char *filename, char *pool, char *name) {
return -EINVAL;
}
- pstrcpy(pool, MAX_NAME, rbdname);
+ pstrcpy(pool, 2*RBD_MAX_SEG_NAME_SIZE, rbdname);
p = strchr(pool, '/');
if (p == NULL) {
return -EINVAL;
@@ -99,7 +99,7 @@ static int rbd_parsename(const char *filename, char *pool, char *name) {
l = strlen(n);
- if (l > MAX_NAME-14) {
+ if (l > RBD_MAX_OBJ_NAME_SIZE) {
fprintf(stderr, "object name to long\n");
return -EINVAL;
} else if (l <= 0) {
@@ -115,10 +115,10 @@ static int rbd_parsename(const char *filename, char *pool, char *name) {
static int rbd_create(const char *filename, QEMUOptionParameter *options) {
int64_t bytes = 0;
int64_t objsize;
- uint8_t objorder = OBJ_DEFAULT_ORDER;
- char pool[MAX_NAME];
- char n[MAX_NAME];
- char name[MAX_NAME];
+ uint8_t obj_order = OBJ_DEFAULT_ORDER;
+ char pool[RBD_MAX_SEG_NAME_SIZE];
+ char n[RBD_MAX_SEG_NAME_SIZE];
+ char name[RBD_MAX_SEG_NAME_SIZE];
RbdHeader1 header;
rados_pool_t p;
int name_len;
@@ -127,7 +127,7 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
return -EINVAL;
}
- snprintf(n, MAX_NAME, "%s.rbd", name);
+ snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", name, RBD_SUFFIX);
/* Read out options */
while (options && options->name) {
@@ -145,7 +145,7 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
return -EINVAL;
}
- for (objorder=0; objorder<64; objorder++) {
+ for (obj_order=0; obj_order<64; obj_order++) {
if (objsize == 1)
break;
objsize >>= 1;
@@ -159,9 +159,9 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
pstrcpy(header.text, sizeof(header.text), RBD_TEXT);
pstrcpy(header.signature, sizeof(header.signature), RBD_SIGNATURE);
pstrcpy(header.version, sizeof(header.version), RBD_VERSION1);
- header.imagesize = bytes;
- cpu_to_le64s(&header.imagesize);
- header.objorder = objorder;
+ header.image_size = bytes;
+ cpu_to_le64s(&header.image_size);
+ header.obj_order = obj_order;
header.crypt_type = CRYPT_NONE;
header.comp_type = COMP_NONE;
header.snap_seq = 0;
@@ -190,14 +190,14 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
static int rbd_open(BlockDriverState *bs, const char *filename, int flags) {
RBDRVRBDState *s = bs->opaque;
- char pool[MAX_NAME];
- char n[MAX_NAME];
+ char pool[RBD_MAX_SEG_NAME_SIZE];
+ char n[RBD_MAX_SEG_NAME_SIZE];
char hbuf[4096];
if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
return -EINVAL;
}
- snprintf(n, MAX_NAME, "%s.rbd", s->name);
+ snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", s->name, RBD_SUFFIX);
if (rados_initialize(0, NULL) < 0) {
fprintf(stderr, "error initializing\n");
@@ -218,9 +218,9 @@ static int rbd_open(BlockDriverState *bs, const char *filename, int flags) {
RbdHeader1 *header;
header = (RbdHeader1 *) hbuf;
- le64_to_cpus(&header->imagesize);
- s->size = header->imagesize;
- s->objsize = 1 << header->objorder;
+ le64_to_cpus(&header->image_size);
+ s->size = header->image_size;
+ s->objsize = 1 << header->obj_order;
} else {
fprintf(stderr, "Unknown image version %s\n", hbuf+68);
return -EIO;
@@ -243,7 +243,7 @@ static void rbd_close(BlockDriverState *bs) {
static int rbd_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors) {
RBDRVRBDState *s = bs->opaque;
- char n[MAX_NAME];
+ char n[RBD_MAX_SEG_NAME_SIZE];
int64_t segnr, segoffs, segsize;
int64_t off, size;
@@ -259,7 +259,7 @@ static int rbd_write(BlockDriverState *bs, int64_t sector_num,
segsize = size;
}
- snprintf(n, MAX_NAME, "%s.%012llx", s->name, (long long unsigned int) segnr);
+ snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
if (rados_write(s->pool, n, segoffs, (const char *) buf, segsize) < 0) {
return -errno;
@@ -278,7 +278,7 @@ static int rbd_write(BlockDriverState *bs, int64_t sector_num,
static int rbd_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors) {
RBDRVRBDState *s = bs->opaque;
- char n[MAX_NAME];
+ char n[RBD_MAX_SEG_NAME_SIZE];
int64_t segnr, segoffs, segsize, r;
int64_t off, size;
@@ -294,7 +294,7 @@ static int rbd_read(BlockDriverState *bs, int64_t sector_num,
segsize = size;
}
- snprintf(n, MAX_NAME, "%s.%012llx", s->name, (long long unsigned int) segnr);
+ snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
r = rados_read(s->pool, n, segoffs, (char *) buf, segsize);
if (r < 0) {
--
1.6.6.1
--
Christian Brunner MUC.DE e.V.
Joseph-Dollinger-Bogen 14
D-80807 Muenchen
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] qemu-kvm/rbd: small fixes and cosmetics
2010-04-14 20:49 [PATCH] qemu-kvm/rbd: small fixes and cosmetics Christian Brunner
@ 2010-04-14 21:12 ` Yehuda Sadeh Weinraub
2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
1 sibling, 0 replies; 5+ messages in thread
From: Yehuda Sadeh Weinraub @ 2010-04-14 21:12 UTC (permalink / raw)
To: Christian Brunner; +Cc: ceph-devel
On Wed, Apr 14, 2010 at 1:49 PM, Christian Brunner <chb@muc.de> wrote:
>
> Attached is a patch with small fixes and cosmetics for the qemu-kvm/rbd
> driver:
>
> - conditional linking of librados
> - reduced header size
> - followed the variable names of the kernel driver
>
> Could you put it in the git repository, please?
Pushed, thanks!
Yehuda
^ permalink raw reply [flat|nested] 5+ messages in thread
* qemu-kvm/rbd: first attempt to implement aio
2010-04-14 20:49 [PATCH] qemu-kvm/rbd: small fixes and cosmetics Christian Brunner
2010-04-14 21:12 ` Yehuda Sadeh Weinraub
@ 2010-04-15 20:24 ` Christian Brunner
2010-04-16 0:09 ` Yehuda Sadeh Weinraub
1 sibling, 1 reply; 5+ messages in thread
From: Christian Brunner @ 2010-04-15 20:24 UTC (permalink / raw)
To: ceph-devel
[-- Attachment #1: Type: text/plain, Size: 408 bytes --]
To increase the performance of the qemu rbd driver I started to implement
aio functions. Attached is an intial aio implementation. However it is
missing the callback of rados_aio_read or _write from time to time.
This leads to a lockup of the disk driver until the guest OS is
perfomring a bus reset.
If someone has an explanation for the cause of the lost callbacks. I
would be happy.
Regards, Christian
[-- Attachment #2: rbd-aio.patch --]
[-- Type: text/plain, Size: 6281 bytes --]
diff --git a/block/rbd.c b/block/rbd.c
index f68b440..5af2c05 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -47,6 +47,17 @@
#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
+typedef struct RBDAIOCB {
+ BlockDriverAIOCB common;
+ QEMUIOVector *qiov;
+ int64_t sector_num;
+ int nb_sectors;
+ QEMUBH *bh;
+ int write;
+ char *buf;
+ int ret;
+ int aiocnt;
+} RBDAIOCB;
typedef struct RBDRVRBDState {
rados_pool_t pool;
@@ -341,6 +352,192 @@ static int rbd_read(BlockDriverState *bs, int64_t sector_num,
return(0);
}
+static int rbd_schedule_bh(QEMUBHFunc *cb, RBDAIOCB *acb) {
+ if (acb->bh) {
+ fprintf(stderr, "rbd_schedule_bh: bh should be NULL\n");
+ return -EIO;
+ }
+
+ acb->bh = qemu_bh_new(cb, acb);
+ if (!acb->bh) {
+ fprintf(stderr, "rbd_schedule_bh: qemu_bh_new failed\n");
+ return -EIO;
+ }
+
+ qemu_bh_schedule(acb->bh);
+
+ return 0;
+}
+
+static void rbd_finish_aiocb(rados_completion_t c, RBDAIOCB *acb) {
+ if (rados_aio_is_complete(c) && !rados_aio_is_safe(c)) {
+ acb->aiocnt--;
+ if (acb->write == 0) {
+ fprintf(stderr, "rbd_finish_aiocb: read complete - aiocnt=%i\n", acb->aiocnt);
+ } else {
+ fprintf(stderr, "rbd_finish_aiocb: write complete - aiocnt=%i\n", acb->aiocnt);
+ }
+ } else {
+ //fprintf(stderr, "rbd_finish_aiocb: returning\n");
+ return;
+ }
+ if (acb->aiocnt == 0) {
+ if (acb->write == 0) {
+ qemu_iovec_from_buffer(acb->qiov, acb->buf, acb->nb_sectors*512);
+ }
+ acb->common.cb(acb->common.opaque, acb->ret);
+ //qemu_free(acb->buf);
+ qemu_aio_release(acb);
+ }
+}
+
+static void rbd_readv_bh_cb(void *p)
+{
+ RBDAIOCB *acb = p;
+ RBDRVRBDState *s = acb->common.bs->opaque;
+ rados_completion_t c;
+ char *buf;
+ char n[RBD_MAX_SEG_NAME_SIZE];
+ int64_t segnr, segoffs, segsize;
+ int64_t off, size;
+
+ off = acb->sector_num * 512;
+ size = acb->nb_sectors * 512;
+ segnr = (int64_t) (off / s->objsize);
+ segoffs = (int64_t) (off % s->objsize);
+ segsize = (int64_t) (s->objsize - segoffs);
+
+ buf = acb->buf;
+
+ while (size > 0) {
+ if (size < segsize) {
+ segsize = size;
+ }
+
+ snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
+ acb->aiocnt++;
+ fprintf(stderr, "rbd_readv_bh_cb: aiocnt=%i: reading %s (segoffs: %lld, segsize %lld)\n", acb->aiocnt, n, (long long int) segoffs, (long long int) segsize);
+ rados_aio_read(s->pool, n, segoffs, (char *) buf, segsize, &c);
+ rados_aio_set_callback(c, (rados_callback_t) rbd_finish_aiocb, acb);
+ buf += segsize;
+ size -= segsize;
+ segoffs = 0;
+ segsize = s->objsize;
+ segnr++;
+ }
+
+ return;
+}
+
+static void rbd_writev_bh_cb(void *p)
+{
+ RBDAIOCB *acb = p;
+ RBDRVRBDState *s = acb->common.bs->opaque;
+ rados_completion_t c;
+ char *buf;
+ char n[RBD_MAX_SEG_NAME_SIZE];
+ int64_t segnr, segoffs, segsize;
+ int64_t off, size;
+
+ off = acb->sector_num * 512;
+ size = acb->nb_sectors * 512;
+ segnr = (int64_t) (off / s->objsize);
+ segoffs = (int64_t) (off % s->objsize);
+ segsize = (int64_t) (s->objsize - segoffs);
+
+ buf = malloc(acb->qiov->size);
+ if (!buf) {
+ /* XXX: error */
+ }
+
+ qemu_iovec_to_buffer(acb->qiov, buf);
+
+ while (size > 0) {
+ if (size < segsize) {
+ segsize = size;
+ }
+
+ snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
+ acb->aiocnt++;
+ fprintf(stderr, "rbd_readv_bh_cb: aiocnt=%i: writing %s (segoffs: %lld, segsize %lld)\n", acb->aiocnt, n, (long long int) segoffs, (long long int) segsize);
+ rados_aio_write(s->pool, n, segoffs, (char *) buf, segsize, &c);
+ rados_aio_set_callback(c, (rados_callback_t) rbd_finish_aiocb, acb);
+
+ buf += segsize;
+ size -= segsize;
+ segoffs = 0;
+ segsize = s->objsize;
+ segnr++;
+ }
+
+ return;
+}
+
+static void rbd_aio_cancel(BlockDriverAIOCB *blockacb) {
+ // Do we have to implement canceling? Seems to work without...
+}
+
+static AIOPool rbd_aio_pool = {
+ .aiocb_size = sizeof(RBDAIOCB),
+ .cancel = rbd_aio_cancel,
+};
+
+static RBDAIOCB *rbd_aio_setup(BlockDriverState *bs,
+ int write, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque) {
+ RBDAIOCB *acb;
+
+ acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+ acb->qiov = qiov;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+ acb->write = write;
+ acb->bh = NULL;
+ acb->aiocnt = 0;
+
+ if(acb->buf) {
+ qemu_free(acb->buf);
+ }
+ acb->buf = qemu_malloc(nb_sectors * 512);
+ if(write == 0) {
+ memset(acb->buf, 0, nb_sectors * 512);
+ }
+
+ return(acb);
+}
+
+static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ RBDAIOCB *acb;
+
+ acb = rbd_aio_setup(bs, 0, qiov, sector_num, nb_sectors, cb, opaque);
+
+ rbd_schedule_bh(rbd_readv_bh_cb, acb);
+ return &acb->common;
+}
+
+
+
+static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ RBDAIOCB *acb;
+
+ acb = rbd_aio_setup(bs, 1, qiov, sector_num, nb_sectors, cb, opaque);
+
+ rbd_schedule_bh(rbd_writev_bh_cb, acb);
+ return &acb->common;
+}
+
+
static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
{
RBDRVRBDState *s = bs->opaque;
@@ -381,6 +578,9 @@ static BlockDriver bdrv_rbd = {
.create_options = rbd_create_options,
.bdrv_getlength = rbd_getlength,
.protocol_name = "rbd",
+
+ .bdrv_aio_readv = rbd_aio_readv,
+ .bdrv_aio_writev= rbd_aio_writev,
};
static void bdrv_rbd_init(void) {
@@ -389,4 +589,3 @@ static void bdrv_rbd_init(void) {
block_init(bdrv_rbd_init);
-
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: qemu-kvm/rbd: first attempt to implement aio
2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
@ 2010-04-16 0:09 ` Yehuda Sadeh Weinraub
2010-04-16 20:53 ` AW: " Christian Brunner
0 siblings, 1 reply; 5+ messages in thread
From: Yehuda Sadeh Weinraub @ 2010-04-16 0:09 UTC (permalink / raw)
To: Christian Brunner; +Cc: ceph-devel
On Thu, Apr 15, 2010 at 1:24 PM, Christian Brunner <chb@muc.de> wrote:
> To increase the performance of the qemu rbd driver I started to implement
> aio functions. Attached is an intial aio implementation. However it is
> missing the callback of rados_aio_read or _write from time to time.
> This leads to a lockup of the disk driver until the guest OS is
> perfomring a bus reset.
>
> If someone has an explanation for the cause of the lost callbacks. I
> would be happy.
>
Yeah, it looks like the rados aio interface is racy. You should be
able to first set aio completion and only then to do the read/write
and not the other way around as it is now. I've just pushed a fix to
the unstable branch that changes the interface a bit. Now you first
need to do a rados_aio_create_completion() first and use it in the
rados_aio_read()/write() call. Note that I haven't tested it yet, so
let me know if there's any problem.
Thanks,
Yehuda
^ permalink raw reply [flat|nested] 5+ messages in thread
* AW: qemu-kvm/rbd: first attempt to implement aio
2010-04-16 0:09 ` Yehuda Sadeh Weinraub
@ 2010-04-16 20:53 ` Christian Brunner
0 siblings, 0 replies; 5+ messages in thread
From: Christian Brunner @ 2010-04-16 20:53 UTC (permalink / raw)
To: Yehuda Sadeh Weinraub; +Cc: ceph-devel
> Yeah, it looks like the rados aio interface is racy. You should be
> able to first set aio completion and only then to do the read/write
> and not the other way around as it is now. I've just pushed a fix to
> the unstable branch that changes the interface a bit. Now you first
> need to do a rados_aio_create_completion() first and use it in the
> rados_aio_read()/write() call. Note that I haven't tested it yet, so
> let me know if there's any problem.
>
> Thanks,
> Yehuda
Thank you - the race condition seems to be fixed.
I'm still experiencing some cases where the callback function is being
called a few hundred times, but I've to dig into this a little bit,
before I can tell about it.
Christian
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2010-04-16 21:19 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-14 20:49 [PATCH] qemu-kvm/rbd: small fixes and cosmetics Christian Brunner
2010-04-14 21:12 ` Yehuda Sadeh Weinraub
2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
2010-04-16 0:09 ` Yehuda Sadeh Weinraub
2010-04-16 20:53 ` AW: " Christian Brunner
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.