All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] qemu-kvm/rbd: small fixes and cosmetics
@ 2010-04-14 20:49 Christian Brunner
  2010-04-14 21:12 ` Yehuda Sadeh Weinraub
  2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
  0 siblings, 2 replies; 5+ messages in thread
From: Christian Brunner @ 2010-04-14 20:49 UTC (permalink / raw)
  To: ceph-devel

Attached is a patch with small fixes and cosmetics for the qemu-kvm/rbd 
driver:

- conditional linking of librados
- reduced header size
- followed the variable names of the kernel driver

Could you put it in the git repository, please?

Christian
---
 Makefile    |    2 ++
 block/rbd.c |   56 ++++++++++++++++++++++++++++----------------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/Makefile b/Makefile
index 6280193..b96743d 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,9 @@ configure: ;
 $(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
 
 LIBS+=-lz $(LIBS_TOOLS)
+ifdef CONFIG_RBD
 LIBS+=-lrados
+endif
 
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
diff --git a/block/rbd.c b/block/rbd.c
index 3513e48..9127bfa 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -46,14 +46,14 @@
 
 #define OBJ_DEFAULT_ORDER 22	// 22 Bit = 4MB (as default)
 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_ORDER)
+#define RBD_MAX_OBJ_NAME_SIZE   96
+#define RBD_MAX_SEG_NAME_SIZE   128
 
-#define MAX_NAME 128 		// Maximum size of the poolname plus objectname
-#define MAX_SNAPS 4096		// Maximum number of snapshots (unsupported at
-				// the moment)
+#define RBD_SUFFIX ".rbd"
 
 typedef struct RBDRVRBDState {
 	rados_pool_t pool;
-	char name[MAX_NAME];
+	char name[RBD_MAX_OBJ_NAME_SIZE];
 	int name_len;
 	uint64_t size;
 	uint64_t objsize;
@@ -70,13 +70,13 @@ typedef struct {
 	char text[64];
 	char signature[4];
 	char version[8];
-	uint64_t imagesize;
-	uint8_t objorder;
+	uint64_t image_size;
+	uint8_t obj_order;
 	uint8_t crypt_type;
 	uint8_t comp_type;			// unsupported at the moment
 	uint64_t snap_seq;			// unsupported at the moment
 	uint16_t snap_count;			// unsupported at the moment
-	uint64_t snap_id[MAX_SNAPS];	// unsupported at the moment
+	uint64_t snap_id[0];			// unsupported at the moment
 } __attribute__((packed)) RbdHeader1;
 
 static int rbd_parsename(const char *filename, char *pool, char *name) {
@@ -88,7 +88,7 @@ static int rbd_parsename(const char *filename, char *pool, char *name) {
 		return -EINVAL;
 	}
 		
-	pstrcpy(pool, MAX_NAME, rbdname);
+	pstrcpy(pool, 2*RBD_MAX_SEG_NAME_SIZE, rbdname);
 	p = strchr(pool, '/');
 	if (p == NULL) {
 		return -EINVAL;
@@ -99,7 +99,7 @@ static int rbd_parsename(const char *filename, char *pool, char *name) {
 
 	l = strlen(n);
 
-	if (l > MAX_NAME-14) {
+	if (l > RBD_MAX_OBJ_NAME_SIZE) {
 		fprintf(stderr, "object name to long\n");
 		return -EINVAL;
 	} else if (l <= 0) {
@@ -115,10 +115,10 @@ static int rbd_parsename(const char *filename, char *pool, char *name) {
 static int rbd_create(const char *filename, QEMUOptionParameter *options) {
 	int64_t bytes = 0;
 	int64_t objsize;
-	uint8_t objorder = OBJ_DEFAULT_ORDER;
-	char pool[MAX_NAME];
-	char n[MAX_NAME];
-	char name[MAX_NAME];
+	uint8_t obj_order = OBJ_DEFAULT_ORDER;
+	char pool[RBD_MAX_SEG_NAME_SIZE];
+	char n[RBD_MAX_SEG_NAME_SIZE];
+	char name[RBD_MAX_SEG_NAME_SIZE];
 	RbdHeader1 header;
 	rados_pool_t p;
 	int name_len;
@@ -127,7 +127,7 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
 		return -EINVAL;
 	}
 
-	snprintf(n, MAX_NAME, "%s.rbd", name);
+	snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", name, RBD_SUFFIX);
 
 	/* Read out options */
 	while (options && options->name) {
@@ -145,7 +145,7 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
 					return -EINVAL;
 				}
 
-				for (objorder=0; objorder<64; objorder++) {
+				for (obj_order=0; obj_order<64; obj_order++) {
 					if (objsize == 1)
 						break;
 					objsize >>= 1;
@@ -159,9 +159,9 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
 	pstrcpy(header.text, sizeof(header.text), RBD_TEXT);
 	pstrcpy(header.signature, sizeof(header.signature), RBD_SIGNATURE);
 	pstrcpy(header.version, sizeof(header.version), RBD_VERSION1);
-	header.imagesize = bytes;
-	cpu_to_le64s(&header.imagesize);
-	header.objorder = objorder;
+	header.image_size = bytes;
+	cpu_to_le64s(&header.image_size);
+	header.obj_order = obj_order;
 	header.crypt_type = CRYPT_NONE;
 	header.comp_type = COMP_NONE;
 	header.snap_seq = 0;
@@ -190,14 +190,14 @@ static int rbd_create(const char *filename, QEMUOptionParameter *options) {
 
 static int rbd_open(BlockDriverState *bs, const char *filename, int flags) {
 	RBDRVRBDState *s = bs->opaque;
-	char pool[MAX_NAME];
-	char n[MAX_NAME];
+	char pool[RBD_MAX_SEG_NAME_SIZE];
+	char n[RBD_MAX_SEG_NAME_SIZE];
 	char hbuf[4096];
 
 	if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
 		return -EINVAL;
 	}
-	snprintf(n, MAX_NAME, "%s.rbd", s->name);
+	snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", s->name, RBD_SUFFIX);
 	
         if (rados_initialize(0, NULL) < 0) {
                 fprintf(stderr, "error initializing\n");
@@ -218,9 +218,9 @@ static int rbd_open(BlockDriverState *bs, const char *filename, int flags) {
 			RbdHeader1 *header;
 
 			header = (RbdHeader1 *) hbuf;
-			le64_to_cpus(&header->imagesize);
-			s->size = header->imagesize;
-			s->objsize = 1 << header->objorder;
+			le64_to_cpus(&header->image_size);
+			s->size = header->image_size;
+			s->objsize = 1 << header->obj_order;
 		} else {
                 	fprintf(stderr, "Unknown image version %s\n", hbuf+68);
 			return -EIO;
@@ -243,7 +243,7 @@ static void rbd_close(BlockDriverState *bs) {
 static int rbd_write(BlockDriverState *bs, int64_t sector_num, 
 		const uint8_t *buf, int nb_sectors) {
 	RBDRVRBDState *s = bs->opaque;
-	char n[MAX_NAME];
+	char n[RBD_MAX_SEG_NAME_SIZE];
 
 	int64_t segnr, segoffs, segsize;
 	int64_t off, size;
@@ -259,7 +259,7 @@ static int rbd_write(BlockDriverState *bs, int64_t sector_num,
 			segsize = size;
 		}
 
-		snprintf(n, MAX_NAME, "%s.%012llx", s->name, (long long unsigned int) segnr);
+		snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
 
 		if (rados_write(s->pool, n, segoffs, (const char *) buf, segsize) < 0) {
 			return -errno;
@@ -278,7 +278,7 @@ static int rbd_write(BlockDriverState *bs, int64_t sector_num,
 static int rbd_read(BlockDriverState *bs, int64_t sector_num, 
 		uint8_t *buf, int nb_sectors) {
 	RBDRVRBDState *s = bs->opaque;
-	char n[MAX_NAME];
+	char n[RBD_MAX_SEG_NAME_SIZE];
 
 	int64_t segnr, segoffs, segsize, r;
 	int64_t off, size;
@@ -294,7 +294,7 @@ static int rbd_read(BlockDriverState *bs, int64_t sector_num,
 			segsize = size;
 		}
 
-		snprintf(n, MAX_NAME, "%s.%012llx", s->name, (long long unsigned int) segnr);
+		snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
 
 		r = rados_read(s->pool, n, segoffs, (char *) buf, segsize);
 		if (r < 0) {
-- 
1.6.6.1



-- 
Christian Brunner                              MUC.DE e.V.
                                               Joseph-Dollinger-Bogen 14
                                               D-80807 Muenchen

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] qemu-kvm/rbd: small fixes and cosmetics
  2010-04-14 20:49 [PATCH] qemu-kvm/rbd: small fixes and cosmetics Christian Brunner
@ 2010-04-14 21:12 ` Yehuda Sadeh Weinraub
  2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
  1 sibling, 0 replies; 5+ messages in thread
From: Yehuda Sadeh Weinraub @ 2010-04-14 21:12 UTC (permalink / raw)
  To: Christian Brunner; +Cc: ceph-devel

On Wed, Apr 14, 2010 at 1:49 PM, Christian Brunner <chb@muc.de> wrote:
>
> Attached is a patch with small fixes and cosmetics for the qemu-kvm/rbd
> driver:
>
> - conditional linking of librados
> - reduced header size
> - followed the variable names of the kernel driver
>
> Could you put it in the git repository, please?

Pushed, thanks!
Yehuda

^ permalink raw reply	[flat|nested] 5+ messages in thread

* qemu-kvm/rbd: first attempt to implement aio
  2010-04-14 20:49 [PATCH] qemu-kvm/rbd: small fixes and cosmetics Christian Brunner
  2010-04-14 21:12 ` Yehuda Sadeh Weinraub
@ 2010-04-15 20:24 ` Christian Brunner
  2010-04-16  0:09   ` Yehuda Sadeh Weinraub
  1 sibling, 1 reply; 5+ messages in thread
From: Christian Brunner @ 2010-04-15 20:24 UTC (permalink / raw)
  To: ceph-devel

[-- Attachment #1: Type: text/plain, Size: 408 bytes --]

To increase the performance of the qemu rbd driver I started to implement
aio functions. Attached is an initial aio implementation. However it is
missing the callback of rados_aio_read or _write from time to time.
This leads to a lockup of the disk driver until the guest OS is 
performing a bus reset.

If someone has an explanation for the cause of the lost callbacks, I
would be happy.

Regards, Christian

[-- Attachment #2: rbd-aio.patch --]
[-- Type: text/plain, Size: 6281 bytes --]

diff --git a/block/rbd.c b/block/rbd.c
index f68b440..5af2c05 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -47,6 +47,17 @@
 
 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
 
+typedef struct RBDAIOCB {
+        BlockDriverAIOCB common;
+        QEMUIOVector *qiov;
+        int64_t sector_num;
+        int nb_sectors;
+        QEMUBH *bh;
+        int write;
+        char *buf;
+        int ret;
+        int aiocnt;
+} RBDAIOCB;
 
 typedef struct RBDRVRBDState {
 	rados_pool_t pool;
@@ -341,6 +352,192 @@ static int rbd_read(BlockDriverState *bs, int64_t sector_num,
 	return(0);
 }
 
+static int rbd_schedule_bh(QEMUBHFunc *cb, RBDAIOCB *acb) {
+        if (acb->bh) {
+                fprintf(stderr, "rbd_schedule_bh: bh should be NULL\n");
+                return -EIO;
+        }
+
+        acb->bh = qemu_bh_new(cb, acb);
+        if (!acb->bh) {
+                fprintf(stderr, "rbd_schedule_bh: qemu_bh_new failed\n");
+                return -EIO;
+        }
+
+        qemu_bh_schedule(acb->bh);
+
+        return 0;
+}
+
+static void rbd_finish_aiocb(rados_completion_t c, RBDAIOCB *acb) {
+	if (rados_aio_is_complete(c) && !rados_aio_is_safe(c)) {
+		acb->aiocnt--;
+		if (acb->write == 0) {
+			fprintf(stderr, "rbd_finish_aiocb: read complete - aiocnt=%i\n", acb->aiocnt);
+		} else {
+			fprintf(stderr, "rbd_finish_aiocb: write complete - aiocnt=%i\n", acb->aiocnt);
+		}
+	} else {
+		//fprintf(stderr, "rbd_finish_aiocb: returning\n");
+		return;
+	}
+	if (acb->aiocnt == 0) {
+		if (acb->write == 0) {
+			qemu_iovec_from_buffer(acb->qiov, acb->buf, acb->nb_sectors*512);
+		}
+	        acb->common.cb(acb->common.opaque, acb->ret);
+		//qemu_free(acb->buf);
+		qemu_aio_release(acb);
+	}
+}
+
+static void rbd_readv_bh_cb(void *p)
+{
+        RBDAIOCB *acb = p;
+        RBDRVRBDState *s = acb->common.bs->opaque;
+	rados_completion_t c;
+	char *buf;
+	char n[RBD_MAX_SEG_NAME_SIZE];
+	int64_t segnr, segoffs, segsize;
+	int64_t off, size;
+
+	off = acb->sector_num * 512;
+	size = acb->nb_sectors * 512;
+	segnr = (int64_t) (off / s->objsize);
+	segoffs = (int64_t) (off % s->objsize);
+	segsize  = (int64_t) (s->objsize - segoffs);
+
+	buf = acb->buf;
+
+	while (size > 0) {
+		if (size < segsize) {
+			segsize = size;
+		}
+	
+		snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
+		acb->aiocnt++;
+		fprintf(stderr, "rbd_readv_bh_cb: aiocnt=%i: reading %s (segoffs: %lld, segsize %lld)\n", acb->aiocnt, n, (long long int) segoffs, (long long int) segsize);
+		rados_aio_read(s->pool, n, segoffs, (char *) buf, segsize, &c);
+		rados_aio_set_callback(c, (rados_callback_t) rbd_finish_aiocb, acb);
+		buf += segsize;
+		size -= segsize;
+		segoffs = 0;
+		segsize = s->objsize;
+		segnr++;
+	}
+
+	return;
+}
+
+static void rbd_writev_bh_cb(void *p)
+{
+        RBDAIOCB *acb = p;
+        RBDRVRBDState *s = acb->common.bs->opaque;
+	rados_completion_t c;
+	char *buf;
+	char n[RBD_MAX_SEG_NAME_SIZE];
+	int64_t segnr, segoffs, segsize;
+	int64_t off, size;
+
+	off = acb->sector_num * 512;
+	size = acb->nb_sectors * 512;
+	segnr = (int64_t) (off / s->objsize);
+	segoffs = (int64_t) (off % s->objsize);
+	segsize  = (int64_t) (s->objsize - segoffs);
+
+	buf = malloc(acb->qiov->size);
+	if (!buf) {
+		/* XXX: error */
+	}
+
+	qemu_iovec_to_buffer(acb->qiov, buf);
+
+	while (size > 0) {
+		if (size < segsize) {
+			segsize = size;
+		}
+	
+		snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name, (long long unsigned int) segnr);
+		acb->aiocnt++;
+		fprintf(stderr, "rbd_readv_bh_cb: aiocnt=%i: writing %s (segoffs: %lld, segsize %lld)\n", acb->aiocnt, n, (long long int) segoffs, (long long int) segsize);
+		rados_aio_write(s->pool, n, segoffs, (char *) buf, segsize, &c);
+		rados_aio_set_callback(c, (rados_callback_t) rbd_finish_aiocb, acb);
+		
+		buf += segsize;
+		size -= segsize;
+		segoffs = 0;
+		segsize = s->objsize;
+		segnr++;
+	}
+
+	return;
+}
+
+static void rbd_aio_cancel(BlockDriverAIOCB *blockacb) {
+	// Do we have to implement canceling? Seems to work without...
+}
+
+static AIOPool rbd_aio_pool = {
+	.aiocb_size         = sizeof(RBDAIOCB),
+	.cancel             = rbd_aio_cancel,
+};
+
+static RBDAIOCB *rbd_aio_setup(BlockDriverState *bs,
+                                     int write, QEMUIOVector *qiov,
+                                     int64_t sector_num, int nb_sectors,
+                                     BlockDriverCompletionFunc *cb,
+                                     void *opaque) {
+	RBDAIOCB *acb;
+
+	acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+	acb->qiov = qiov;
+	acb->sector_num = sector_num;
+	acb->nb_sectors = nb_sectors;
+	acb->write = write;
+	acb->bh = NULL;
+	acb->aiocnt = 0;
+
+	if(acb->buf) {
+		qemu_free(acb->buf);
+	}
+	acb->buf = qemu_malloc(nb_sectors * 512);
+	if(write == 0) {
+		memset(acb->buf, 0, nb_sectors * 512);
+	}
+
+	return(acb);
+}
+
+static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+	RBDAIOCB *acb;
+
+	acb = rbd_aio_setup(bs, 0, qiov, sector_num, nb_sectors, cb, opaque);
+
+	rbd_schedule_bh(rbd_readv_bh_cb, acb);
+	return &acb->common;
+}
+
+
+
+static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState *bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector *qiov,
+                                       int nb_sectors,
+                                       BlockDriverCompletionFunc *cb,
+                                       void *opaque)
+{
+        RBDAIOCB *acb;
+
+        acb = rbd_aio_setup(bs, 1, qiov, sector_num, nb_sectors, cb, opaque);
+
+        rbd_schedule_bh(rbd_writev_bh_cb, acb);
+        return &acb->common;
+}
+
+
 static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     RBDRVRBDState *s = bs->opaque;
@@ -381,6 +578,9 @@ static BlockDriver bdrv_rbd = {
 	.create_options = rbd_create_options,
 	.bdrv_getlength	= rbd_getlength,
 	.protocol_name	= "rbd",
+
+	.bdrv_aio_readv = rbd_aio_readv,
+	.bdrv_aio_writev= rbd_aio_writev,
 };
 
 static void bdrv_rbd_init(void) {
@@ -389,4 +589,3 @@ static void bdrv_rbd_init(void) {
 
 block_init(bdrv_rbd_init);
 
-

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: qemu-kvm/rbd: first attempt to implement aio
  2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
@ 2010-04-16  0:09   ` Yehuda Sadeh Weinraub
  2010-04-16 20:53     ` AW: " Christian Brunner
  0 siblings, 1 reply; 5+ messages in thread
From: Yehuda Sadeh Weinraub @ 2010-04-16  0:09 UTC (permalink / raw)
  To: Christian Brunner; +Cc: ceph-devel

On Thu, Apr 15, 2010 at 1:24 PM, Christian Brunner <chb@muc.de> wrote:
> To increase the performance of the qemu rbd driver I started to implement
> aio functions. Attached is an intial aio implementation. However it is
> missing the callback of rados_aio_read or _write from time to time.
> This leads to a lockup of the disk driver until the guest OS is
> perfomring a bus reset.
>
> If someone has an explanation for the cause of the lost callbacks. I
> would be happy.
>

Yeah, it looks like the rados aio interface is racy. You should be
able to first set aio completion and only then to do the read/write
and not the other way around as it is now. I've just pushed a fix to
the unstable branch that changes the interface a bit. Now you first
need to do a rados_aio_create_completion() first and use it in the
rados_aio_read()/write() call. Note that I haven't tested it yet, so
let me know if there's any problem.


Thanks,
Yehuda

^ permalink raw reply	[flat|nested] 5+ messages in thread

* AW: qemu-kvm/rbd: first attempt to implement aio
  2010-04-16  0:09   ` Yehuda Sadeh Weinraub
@ 2010-04-16 20:53     ` Christian Brunner
  0 siblings, 0 replies; 5+ messages in thread
From: Christian Brunner @ 2010-04-16 20:53 UTC (permalink / raw)
  To: Yehuda Sadeh Weinraub; +Cc: ceph-devel

> Yeah, it looks like the rados aio interface is racy. You should be
> able to first set aio completion and only then to do the read/write
> and not the other way around as it is now. I've just pushed a fix to
> the unstable branch that changes the interface a bit. Now you first
> need to do a rados_aio_create_completion() first and use it in the
> rados_aio_read()/write() call. Note that I haven't tested it yet, so
> let me know if there's any problem.
>
> Thanks,
> Yehuda

Thank you - the race condition seems to be fixed. 

I'm still experiencing some cases where the callback function is being
called a few hundred times, but I have to dig into this a little bit
before I can tell more about it.

Christian




^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2010-04-16 21:19 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-14 20:49 [PATCH] qemu-kvm/rbd: small fixes and cosmetics Christian Brunner
2010-04-14 21:12 ` Yehuda Sadeh Weinraub
2010-04-15 20:24 ` qemu-kvm/rbd: first attempt to implement aio Christian Brunner
2010-04-16  0:09   ` Yehuda Sadeh Weinraub
2010-04-16 20:53     ` AW: " Christian Brunner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.