* [PATCH] rbd: convert to blk-mq
@ 2015-01-10 18:31 Christoph Hellwig
  2015-01-10 23:40 ` Alex Elder
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2015-01-10 18:31 UTC (permalink / raw)
  To: Yehuda Sadeh, Sage Weil, Alex Elder; +Cc: Alexandre DERUMIER, ceph-devel

This converts the rbd driver to use the blk-mq infrastructure.  Except
for switching to a per-request work item this is almost mechanical.

This was tested by Alexandre DERUMIER in November, and found to give
him 120000 iops, although the only comparison available was an old
3.10 kernel, which gave 80000 iops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/rbd.c | 118 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 51 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3ec85df..52cd677 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 #include <linux/module.h>
+#include <linux/blk-mq.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -342,7 +343,6 @@ struct rbd_device {
 
 	struct list_head	rq_queue;	/* incoming rq queue */
 	spinlock_t		lock;		/* queue, flags, open_count */
-	struct work_struct	rq_work;
 
 	struct rbd_image_header	header;
 	unsigned long		flags;		/* possibly lock protected */
@@ -360,6 +360,9 @@ struct rbd_device {
 	atomic_t		parent_ref;
 	struct rbd_device	*parent;
 
+	/* Block layer tags. */
+	struct blk_mq_tag_set	tag_set;
+
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
 
@@ -1817,7 +1820,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 
 	/*
 	 * We support a 64-bit length, but ultimately it has to be
-	 * passed to blk_end_request(), which takes an unsigned int.
+	 * passed to the block layer, which just supports a 32-bit
+	 * length field.
 	 */
 	obj_request->xferred = osd_req->r_reply_op_len[0];
 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2281,7 +2285,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 		more = obj_request->which < img_request->obj_request_count - 1;
 	} else {
 		rbd_assert(img_request->rq != NULL);
-		more = blk_end_request(img_request->rq, result, xferred);
+	
+		more = blk_update_request(img_request->rq, result, xferred);
+		if (!more)
+			__blk_mq_end_request(img_request->rq, result);
 	}
 
 	return more;
@@ -3310,8 +3317,10 @@ out:
 	return ret;
 }
 
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
 {
+	struct request *rq = blk_mq_rq_from_pdu(work);
+	struct rbd_device *rbd_dev = rq->q->queuedata;
 	struct rbd_img_request *img_request;
 	struct ceph_snap_context *snapc = NULL;
 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3319,6 +3328,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 	enum obj_operation_type op_type;
 	u64 mapping_size;
 	int result;
+		
+	if (rq->cmd_type != REQ_TYPE_FS) {
+		dout("%s: non-fs request type %d\n", __func__,
+			(int) rq->cmd_type);
+		result = -EIO;
+		goto err;
+	}
 
 	if (rq->cmd_flags & REQ_DISCARD)
 		op_type = OBJ_OP_DISCARD;
@@ -3358,6 +3374,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 		goto err_rq;
 	}
 
+	blk_mq_start_request(rq);
+
 	if (offset && length > U64_MAX - offset + 1) {
 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
 			 length);
@@ -3411,52 +3429,18 @@ err_rq:
 			 obj_op_name(op_type), length, offset, result);
 	ceph_put_snap_context(snapc);
 	blk_end_request_all(rq, result);
+err:
+	blk_mq_end_request(rq, result);
 }
 
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
 {
-	struct rbd_device *rbd_dev =
-	    container_of(work, struct rbd_device, rq_work);
-	struct request *rq, *next;
-	LIST_HEAD(requests);
-
-	spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
-	list_splice_init(&rbd_dev->rq_queue, &requests);
-	spin_unlock_irq(&rbd_dev->lock);
-
-	list_for_each_entry_safe(rq, next, &requests, queuelist) {
-		list_del_init(&rq->queuelist);
-		rbd_handle_request(rbd_dev, rq);
-	}
-}
+	struct request *rq = bd->rq;
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
 
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule().  Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
-	struct rbd_device *rbd_dev = q->queuedata;
-	struct request *rq;
-	int queued = 0;
-
-	rbd_assert(rbd_dev);
-
-	while ((rq = blk_fetch_request(q))) {
-		/* Ignore any non-FS requests that filter through. */
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			dout("%s: non-fs request type %d\n", __func__,
-				(int) rq->cmd_type);
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
-
-		list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
-		queued++;
-	}
-
-	if (queued)
-		queue_work(rbd_wq, &rbd_dev->rq_work);
+	queue_work(rbd_wq, work);
+	return 0;
 }
 
 /*
@@ -3517,6 +3501,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 		del_gendisk(disk);
 		if (disk->queue)
 			blk_cleanup_queue(disk->queue);
+		blk_mq_free_tag_set(&rbd_dev->tag_set);
 	}
 	put_disk(disk);
 }
@@ -3728,11 +3713,28 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 	return 0;
 }
 
+static int rbd_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+	INIT_WORK(work, rbd_queue_workfn);
+	return 0;
+}
+
+static struct blk_mq_ops rbd_mq_ops = {
+	.queue_rq	= rbd_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_request	= rbd_init_request,
+};
+
 static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
 	u64 segment_size;
+	int err;
 
 	/* create gendisk info */
 	disk = alloc_disk(single_major ?
@@ -3750,10 +3752,24 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	disk->fops = &rbd_bd_ops;
 	disk->private_data = rbd_dev;
 
-	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
-	if (!q)
+	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+	rbd_dev->tag_set.ops = &rbd_mq_ops;
+	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+	rbd_dev->tag_set.flags =
+		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	rbd_dev->tag_set.nr_hw_queues = 1;
+	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+	if (err)
 		goto out_disk;
 
+	err = -ENOMEM;
+	q = blk_mq_init_queue(&rbd_dev->tag_set);
+	if (!q)
+		goto out_tag_set;
+
 	/* We use the default size, but let's be explicit about it. */
 	blk_queue_physical_block_size(q, SECTOR_SIZE);
 
@@ -3779,10 +3795,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	rbd_dev->disk = disk;
 
 	return 0;
+out_tag_set:
+	blk_mq_free_tag_set(&rbd_dev->tag_set);
 out_disk:
 	put_disk(disk);
-
-	return -ENOMEM;
+	return err;
 }
 
 /*
@@ -4040,7 +4057,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 
 	spin_lock_init(&rbd_dev->lock);
 	INIT_LIST_HEAD(&rbd_dev->rq_queue);
-	INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
 	rbd_dev->flags = 0;
 	atomic_set(&rbd_dev->parent_ref, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
-- 
1.9.1



* Re: [PATCH] rbd: convert to blk-mq
  2015-01-10 18:31 [PATCH] rbd: convert to blk-mq Christoph Hellwig
@ 2015-01-10 23:40 ` Alex Elder
  2015-01-12 12:40   ` [PATCH v2] " Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Alex Elder @ 2015-01-10 23:40 UTC (permalink / raw)
  To: Christoph Hellwig, Yehuda Sadeh, Sage Weil, Alex Elder
  Cc: Alexandre DERUMIER, ceph-devel

On 01/10/2015 12:31 PM, Christoph Hellwig wrote:
> This converts the rbd driver to use the blk-mq infrastructure.  Except
> for switching to a per-request work item this is almost mechanical.
> 
> This was tested by Alexandre DERUMIER in November, and found to give
> him 120000 iops, although the only comparison available was an old
> 3.10 kernel, which gave 80000 iops.

I'm coming up to speed with the blk-mq stuff only now.  It looks
like requests are sent to the driver via ->queue_rq() rather than
the driver taking them via blk_fetch_request(q).

Previously we would pull as many requests as were available, put
them on the device's request queue, and then activate the rbd
workqueue to handle them one-by-one using rbd_handle_request().

Now, the rbd queue_rq method, rbd_queue_rq(), adds the request
to the rbd workqueue directly.  The work_struct implicitly follows
the request structure (which is set up by the blk-mq code).  We
have to do the REQ_TYPE_FS check at the time it's queued now,
rather than when it's fetched from the queue.  And finally we now
have to tell the blk-mq subsystem when we've started and ended a
request.
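
(For anyone else coming up to speed: as far as I can tell, the two
helpers involved boil down to pointer arithmetic on the cmd_size bytes
that blk-mq allocates directly behind each request, so no extra
allocation or lookup is needed.  Roughly, from the 3.19-era
<linux/blk-mq.h>:)

static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	/* the driver PDU (here the work_struct) sits right after the request */
	return rq + 1;
}

static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	/* inverse mapping, used by rbd_queue_workfn() to get back to the request */
	return pdu - sizeof(struct request);
}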

I didn't follow up on all the tag_set initialization values
so I assume you got that right (it looks reasonable to me).
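
(For reference, my reading of those fields; the values are straight
from the patch, the annotations are mine:)

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;		/* queue_rq/map_queue/init_request above */
	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;	/* 128, same as the old single-queue default */
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;	/* no NUMA affinity for the request memory */
	rbd_dev->tag_set.flags =
		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;	/* keep bio/segment merging enabled */
	rbd_dev->tag_set.nr_hw_queues = 1;		/* one hardware context is enough for rbd */
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);	/* per-request PDU is the work item */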

Given the above, it looks like everything else should work
about the same as before; we're just handed requests rather
than asking for them.

With this patch applied, rbd_device->rq_queue is no longer
needed, so you should delete it.  I got two warnings about
end-of-line whitespace in your patch.  And I have one other
very small suggestion below.

Other than those things, this looks great to me.

Reviewed-by: Alex Elder <elder@linaro.org>

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/block/rbd.c | 118 +++++++++++++++++++++++++++++-----------------------
>  1 file changed, 67 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 3ec85df..52cd677 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c

. . .

(The following is in the new rbd_queue_rq().)

> +	queue_work(rbd_wq, work);
> +	return 0;

	return BLK_MQ_RQ_QUEUE_OK;

(Because the symbolic values are explicitly checked
by the caller.)
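
(If memory serves, the 3.19-era <linux/blk-mq.h> defines the possible
queue_rq return values as below; returning a bare 0 happens to equal
BLK_MQ_RQ_QUEUE_OK, but the BUSY/ERROR cases are why the symbolic name
is the right thing to use.  Worth double-checking against the header.)

enum {
	BLK_MQ_RQ_QUEUE_OK	= 0,	/* queued fine */
	BLK_MQ_RQ_QUEUE_BUSY	= 1,	/* requeue the request for later */
	BLK_MQ_RQ_QUEUE_ERROR	= 2,	/* end the request with an error */
};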

>  }
>  
>  /*

. . .



* [PATCH v2] rbd: convert to blk-mq
  2015-01-10 23:40 ` Alex Elder
@ 2015-01-12 12:40   ` Christoph Hellwig
       [not found]     ` <1682521695.3794355.1421069570592.JavaMail.zimbra@oxygem.tv>
  2015-01-12 17:10     ` Ilya Dryomov
  0 siblings, 2 replies; 7+ messages in thread
From: Christoph Hellwig @ 2015-01-12 12:40 UTC (permalink / raw)
  To: Alex Elder
  Cc: Yehuda Sadeh, Sage Weil, Alex Elder, Alexandre DERUMIER, ceph-devel

This converts the rbd driver to use the blk-mq infrastructure.  Except
for switching to a per-request work item this is almost mechanical.

This was tested by Alexandre DERUMIER in November, and found to give
him 120000 iops, although the only comparison available was an old
3.10 kernel, which gave 80000 iops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 120 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 53 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3ec85df..c64a798 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 #include <linux/module.h>
+#include <linux/blk-mq.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -340,9 +341,7 @@ struct rbd_device {
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
-	struct list_head	rq_queue;	/* incoming rq queue */
 	spinlock_t		lock;		/* queue, flags, open_count */
-	struct work_struct	rq_work;
 
 	struct rbd_image_header	header;
 	unsigned long		flags;		/* possibly lock protected */
@@ -360,6 +359,9 @@ struct rbd_device {
 	atomic_t		parent_ref;
 	struct rbd_device	*parent;
 
+	/* Block layer tags. */
+	struct blk_mq_tag_set	tag_set;
+
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
 
@@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 
 	/*
 	 * We support a 64-bit length, but ultimately it has to be
-	 * passed to blk_end_request(), which takes an unsigned int.
+	 * passed to the block layer, which just supports a 32-bit
+	 * length field.
 	 */
 	obj_request->xferred = osd_req->r_reply_op_len[0];
 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2281,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 		more = obj_request->which < img_request->obj_request_count - 1;
 	} else {
 		rbd_assert(img_request->rq != NULL);
-		more = blk_end_request(img_request->rq, result, xferred);
+
+		more = blk_update_request(img_request->rq, result, xferred);
+		if (!more)
+			__blk_mq_end_request(img_request->rq, result);
 	}
 
 	return more;
@@ -3310,8 +3316,10 @@ out:
 	return ret;
 }
 
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
 {
+	struct request *rq = blk_mq_rq_from_pdu(work);
+	struct rbd_device *rbd_dev = rq->q->queuedata;
 	struct rbd_img_request *img_request;
 	struct ceph_snap_context *snapc = NULL;
 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3320,6 +3328,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 	u64 mapping_size;
 	int result;
 
+	if (rq->cmd_type != REQ_TYPE_FS) {
+		dout("%s: non-fs request type %d\n", __func__,
+			(int) rq->cmd_type);
+		result = -EIO;
+		goto err;
+	}
+
 	if (rq->cmd_flags & REQ_DISCARD)
 		op_type = OBJ_OP_DISCARD;
 	else if (rq->cmd_flags & REQ_WRITE)
@@ -3358,6 +3373,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 		goto err_rq;
 	}
 
+	blk_mq_start_request(rq);
+
 	if (offset && length > U64_MAX - offset + 1) {
 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
 			 length);
@@ -3411,52 +3428,18 @@ err_rq:
 			 obj_op_name(op_type), length, offset, result);
 	ceph_put_snap_context(snapc);
 	blk_end_request_all(rq, result);
+err:
+	blk_mq_end_request(rq, result);
 }
 
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
 {
-	struct rbd_device *rbd_dev =
-	    container_of(work, struct rbd_device, rq_work);
-	struct request *rq, *next;
-	LIST_HEAD(requests);
-
-	spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
-	list_splice_init(&rbd_dev->rq_queue, &requests);
-	spin_unlock_irq(&rbd_dev->lock);
-
-	list_for_each_entry_safe(rq, next, &requests, queuelist) {
-		list_del_init(&rq->queuelist);
-		rbd_handle_request(rbd_dev, rq);
-	}
-}
-
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule().  Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
-	struct rbd_device *rbd_dev = q->queuedata;
-	struct request *rq;
-	int queued = 0;
-
-	rbd_assert(rbd_dev);
-
-	while ((rq = blk_fetch_request(q))) {
-		/* Ignore any non-FS requests that filter through. */
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			dout("%s: non-fs request type %d\n", __func__,
-				(int) rq->cmd_type);
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
-
-		list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
-		queued++;
-	}
+	struct request *rq = bd->rq;
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
 
-	if (queued)
-		queue_work(rbd_wq, &rbd_dev->rq_work);
+	queue_work(rbd_wq, work);
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 /*
@@ -3517,6 +3500,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 		del_gendisk(disk);
 		if (disk->queue)
 			blk_cleanup_queue(disk->queue);
+		blk_mq_free_tag_set(&rbd_dev->tag_set);
 	}
 	put_disk(disk);
 }
@@ -3728,11 +3712,28 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 	return 0;
 }
 
+static int rbd_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+	INIT_WORK(work, rbd_queue_workfn);
+	return 0;
+}
+
+static struct blk_mq_ops rbd_mq_ops = {
+	.queue_rq	= rbd_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_request	= rbd_init_request,
+};
+
 static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
 	u64 segment_size;
+	int err;
 
 	/* create gendisk info */
 	disk = alloc_disk(single_major ?
@@ -3750,10 +3751,24 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	disk->fops = &rbd_bd_ops;
 	disk->private_data = rbd_dev;
 
-	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
-	if (!q)
+	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+	rbd_dev->tag_set.ops = &rbd_mq_ops;
+	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+	rbd_dev->tag_set.flags =
+		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	rbd_dev->tag_set.nr_hw_queues = 1;
+	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+	if (err)
 		goto out_disk;
 
+	err = -ENOMEM;
+	q = blk_mq_init_queue(&rbd_dev->tag_set);
+	if (!q)
+		goto out_tag_set;
+
 	/* We use the default size, but let's be explicit about it. */
 	blk_queue_physical_block_size(q, SECTOR_SIZE);
 
@@ -3779,10 +3794,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	rbd_dev->disk = disk;
 
 	return 0;
+out_tag_set:
+	blk_mq_free_tag_set(&rbd_dev->tag_set);
 out_disk:
 	put_disk(disk);
-
-	return -ENOMEM;
+	return err;
 }
 
 /*
@@ -4039,8 +4055,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
-	INIT_LIST_HEAD(&rbd_dev->rq_queue);
-	INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
 	rbd_dev->flags = 0;
 	atomic_set(&rbd_dev->parent_ref, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
-- 
1.9.1



* Re: [PATCH v2] rbd: convert to blk-mq
       [not found]     ` <1682521695.3794355.1421069570592.JavaMail.zimbra@oxygem.tv>
@ 2015-01-12 13:32       ` Alexandre DERUMIER
  0 siblings, 0 replies; 7+ messages in thread
From: Alexandre DERUMIER @ 2015-01-12 13:32 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Elder, Yehuda Sadeh, Sage Weil, Alex Elder, ceph-devel

Hi Christoph,

I'll have my production cluster ready around next month,

with much more powerful nodes (each node: 2x 10-core CPUs at 3.1 GHz + 6 Intel S3500 SSDs).

I'll redo the benchmark and post results as soon as possible.


----- Original Message -----
From: "Christoph Hellwig" <hch@lst.de>
To: "Alex Elder" <elder@ieee.org>
Cc: "Yehuda Sadeh" <yehuda@inktank.com>, "Sage Weil" <sage@inktank.com>, "Alex Elder" <elder@kernel.org>, "aderumier" <aderumier@odiso.com>, "ceph-devel" <ceph-devel@vger.kernel.org>
Sent: Monday, January 12, 2015 13:40:02
Subject: [PATCH v2] rbd: convert to blk-mq

This converts the rbd driver to use the blk-mq infrastructure.  Except
for switching to a per-request work item this is almost mechanical.

This was tested by Alexandre DERUMIER in November, and found to give
him 120000 iops, although the only comparison available was an old
3.10 kernel, which gave 80000 iops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 120 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 53 deletions(-)



* Re: [PATCH v2] rbd: convert to blk-mq
  2015-01-12 12:40   ` [PATCH v2] " Christoph Hellwig
       [not found]     ` <1682521695.3794355.1421069570592.JavaMail.zimbra@oxygem.tv>
@ 2015-01-12 17:10     ` Ilya Dryomov
  2015-01-13 16:19       ` Christoph Hellwig
  2015-01-13 16:20       ` [PATCH v3] " Christoph Hellwig
  1 sibling, 2 replies; 7+ messages in thread
From: Ilya Dryomov @ 2015-01-12 17:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Elder, Yehuda Sadeh, Sage Weil, Alex Elder,
	Alexandre DERUMIER, Ceph Development

On Mon, Jan 12, 2015 at 3:40 PM, Christoph Hellwig <hch@lst.de> wrote:
> This converts the rbd driver to use the blk-mq infrastructure.  Except
> for switching to a per-request work item this is almost mechanical.

Hi Christoph,

I too am not up to speed on blk-mq but still have a couple sanity
questions/comments below.

>
> This was tested by Alexandre DERUMIER in November, and found to give
> him 120000 iops, although the only comparison available was an old
> 3.10 kernel, which gave 80000 iops.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Alex Elder <elder@linaro.org>
> ---
>  drivers/block/rbd.c | 120 +++++++++++++++++++++++++++++-----------------------
>  1 file changed, 67 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 3ec85df..c64a798 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -38,6 +38,7 @@
>  #include <linux/kernel.h>
>  #include <linux/device.h>
>  #include <linux/module.h>
> +#include <linux/blk-mq.h>
>  #include <linux/fs.h>
>  #include <linux/blkdev.h>
>  #include <linux/slab.h>
> @@ -340,9 +341,7 @@ struct rbd_device {
>
>         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
>
> -       struct list_head        rq_queue;       /* incoming rq queue */
>         spinlock_t              lock;           /* queue, flags, open_count */
> -       struct work_struct      rq_work;
>
>         struct rbd_image_header header;
>         unsigned long           flags;          /* possibly lock protected */
> @@ -360,6 +359,9 @@ struct rbd_device {
>         atomic_t                parent_ref;
>         struct rbd_device       *parent;
>
> +       /* Block layer tags. */
> +       struct blk_mq_tag_set   tag_set;
> +
>         /* protects updating the header */
>         struct rw_semaphore     header_rwsem;
>
> @@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
>
>         /*
>          * We support a 64-bit length, but ultimately it has to be
> -        * passed to blk_end_request(), which takes an unsigned int.
> +        * passed to the block layer, which just supports a 32-bit
> +        * length field.
>          */
>         obj_request->xferred = osd_req->r_reply_op_len[0];
>         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
> @@ -2281,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
>                 more = obj_request->which < img_request->obj_request_count - 1;
>         } else {
>                 rbd_assert(img_request->rq != NULL);
> -               more = blk_end_request(img_request->rq, result, xferred);
> +
> +               more = blk_update_request(img_request->rq, result, xferred);
> +               if (!more)
> +                       __blk_mq_end_request(img_request->rq, result);
>         }
>
>         return more;
> @@ -3310,8 +3316,10 @@ out:
>         return ret;
>  }
>
> -static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
> +static void rbd_queue_workfn(struct work_struct *work)
>  {
> +       struct request *rq = blk_mq_rq_from_pdu(work);
> +       struct rbd_device *rbd_dev = rq->q->queuedata;
>         struct rbd_img_request *img_request;
>         struct ceph_snap_context *snapc = NULL;
>         u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
> @@ -3320,6 +3328,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
>         u64 mapping_size;
>         int result;
>
> +       if (rq->cmd_type != REQ_TYPE_FS) {
> +               dout("%s: non-fs request type %d\n", __func__,
> +                       (int) rq->cmd_type);
> +               result = -EIO;
> +               goto err;
> +       }

So in the !REQ_TYPE_FS case we would error out with blk_mq_end_request(),
while the other (probably silly, but still) checks error out to the err_rq
label.  Any particular reason for this?


> +
>         if (rq->cmd_flags & REQ_DISCARD)
>                 op_type = OBJ_OP_DISCARD;
>         else if (rq->cmd_flags & REQ_WRITE)
> @@ -3358,6 +3373,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
>                 goto err_rq;
>         }
>
> +       blk_mq_start_request(rq);

Why is this call here?  Why not above or below?  I doubt it makes much
difference, but from a clarity standpoint at least, shouldn't it be
placed after all the checks and allocations, say before the call to
rbd_img_request_submit()?

> +
>         if (offset && length > U64_MAX - offset + 1) {
>                 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
>                          length);
> @@ -3411,52 +3428,18 @@ err_rq:
>                          obj_op_name(op_type), length, offset, result);
>         ceph_put_snap_context(snapc);
>         blk_end_request_all(rq, result);
> +err:
> +       blk_mq_end_request(rq, result);

Expanding on the REQ_TYPE_FS comment, isn't blk_mq_end_request() enough?
Swap blk_end_request_all() for blk_mq_end_request() and get rid of the
err label?

>  }
>
> -static void rbd_request_workfn(struct work_struct *work)
> +static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
> +               const struct blk_mq_queue_data *bd)
>  {
> -       struct rbd_device *rbd_dev =
> -           container_of(work, struct rbd_device, rq_work);
> -       struct request *rq, *next;
> -       LIST_HEAD(requests);
> -
> -       spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
> -       list_splice_init(&rbd_dev->rq_queue, &requests);
> -       spin_unlock_irq(&rbd_dev->lock);
> -
> -       list_for_each_entry_safe(rq, next, &requests, queuelist) {
> -               list_del_init(&rq->queuelist);
> -               rbd_handle_request(rbd_dev, rq);
> -       }
> -}
> -
> -/*
> - * Called with q->queue_lock held and interrupts disabled, possibly on
> - * the way to schedule().  Do not sleep here!
> - */
> -static void rbd_request_fn(struct request_queue *q)
> -{
> -       struct rbd_device *rbd_dev = q->queuedata;
> -       struct request *rq;
> -       int queued = 0;
> -
> -       rbd_assert(rbd_dev);
> -
> -       while ((rq = blk_fetch_request(q))) {
> -               /* Ignore any non-FS requests that filter through. */
> -               if (rq->cmd_type != REQ_TYPE_FS) {
> -                       dout("%s: non-fs request type %d\n", __func__,
> -                               (int) rq->cmd_type);
> -                       __blk_end_request_all(rq, 0);
> -                       continue;
> -               }
> -
> -               list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
> -               queued++;
> -       }
> +       struct request *rq = bd->rq;
> +       struct work_struct *work = blk_mq_rq_to_pdu(rq);
>
> -       if (queued)
> -               queue_work(rbd_wq, &rbd_dev->rq_work);
> +       queue_work(rbd_wq, work);

Finally, if we are switching to blk-mq, is there any way to avoid the
"re-queueing to internal rbd_wq" dance?  I introduced it to avoid
blocking in request_fn(), but I remember you mentioning that we should
be able to get rid of it with the switch to blk-mq.  Is "blk-mq: allow
direct dispatch to a driver specific workqueue" related to this?

Thanks,

                Ilya


* Re: [PATCH v2] rbd: convert to blk-mq
  2015-01-12 17:10     ` Ilya Dryomov
@ 2015-01-13 16:19       ` Christoph Hellwig
  2015-01-13 16:20       ` [PATCH v3] " Christoph Hellwig
  1 sibling, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2015-01-13 16:19 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: Alex Elder, Yehuda Sadeh, Sage Weil, Alex Elder,
	Alexandre DERUMIER, Ceph Development

On Mon, Jan 12, 2015 at 08:10:48PM +0300, Ilya Dryomov wrote:
> Why is this call here?  Why not above or below?  I doubt it makes much
> difference, but from a clarity standpoint at least, shouldn't it be
> placed after all the checks and allocations, say before the call to
> rbd_img_request_submit()?

The idea is to do it before doing real work, but after the request
is set up far enough that a cancellation works.  For rbd, which doesn't
do timeouts or cancellations, it really doesn't matter too much.  I've
now moved it a little further down, after the next trivial check.
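
To make the intended ordering concrete, a rough sketch (example_submit_io
is a placeholder for the rbd-specific submission path, not a real
function):

static int example_submit_io(struct request *rq);	/* placeholder */

static void example_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	int result;

	/* cheap sanity checks that may still fail fast */
	if (rq->cmd_type != REQ_TYPE_FS) {
		result = -EIO;
		goto err;
	}

	/*
	 * Mark the request started once it is set up far enough that a
	 * timeout or cancellation would see a live request, but before
	 * any real work has been committed.
	 */
	blk_mq_start_request(rq);

	/*
	 * The real work; on success the request is completed later, from
	 * the I/O completion callback.
	 */
	result = example_submit_io(rq);
	if (!result)
		return;
err:
	blk_mq_end_request(rq, result);
}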

> Expanding on REQ_TYPE_FS comment, isn't blk_mq_end_request() enough?
> Swap blk_end_request_all() for blk_mq_end_request() and get rid of err
> label?

The blk_end_request_all() call should be gone; it sneaked back in due to
a sloppy rebase.


* [PATCH v3] rbd: convert to blk-mq
  2015-01-12 17:10     ` Ilya Dryomov
  2015-01-13 16:19       ` Christoph Hellwig
@ 2015-01-13 16:20       ` Christoph Hellwig
  1 sibling, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2015-01-13 16:20 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: Alex Elder, Yehuda Sadeh, Sage Weil, Alex Elder,
	Alexandre DERUMIER, Ceph Development

This converts the rbd driver to use the blk-mq infrastructure.  Except
for switching to a per-request work item this is almost mechanical.

This was tested by Alexandre DERUMIER in November, and found to give
him 120000 iops, although the only comparison available was an old
3.10 kernel, which gave 80000 iops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 121 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 54 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3ec85df..b5f0cd3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 #include <linux/module.h>
+#include <linux/blk-mq.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -340,9 +341,7 @@ struct rbd_device {
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
-	struct list_head	rq_queue;	/* incoming rq queue */
 	spinlock_t		lock;		/* queue, flags, open_count */
-	struct work_struct	rq_work;
 
 	struct rbd_image_header	header;
 	unsigned long		flags;		/* possibly lock protected */
@@ -360,6 +359,9 @@ struct rbd_device {
 	atomic_t		parent_ref;
 	struct rbd_device	*parent;
 
+	/* Block layer tags. */
+	struct blk_mq_tag_set	tag_set;
+
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
 
@@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 
 	/*
 	 * We support a 64-bit length, but ultimately it has to be
-	 * passed to blk_end_request(), which takes an unsigned int.
+	 * passed to the block layer, which just supports a 32-bit
+	 * length field.
 	 */
 	obj_request->xferred = osd_req->r_reply_op_len[0];
 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2281,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 		more = obj_request->which < img_request->obj_request_count - 1;
 	} else {
 		rbd_assert(img_request->rq != NULL);
-		more = blk_end_request(img_request->rq, result, xferred);
+
+		more = blk_update_request(img_request->rq, result, xferred);
+		if (!more)
+			__blk_mq_end_request(img_request->rq, result);
 	}
 
 	return more;
@@ -3310,8 +3316,10 @@ out:
 	return ret;
 }
 
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
 {
+	struct request *rq = blk_mq_rq_from_pdu(work);
+	struct rbd_device *rbd_dev = rq->q->queuedata;
 	struct rbd_img_request *img_request;
 	struct ceph_snap_context *snapc = NULL;
 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3320,6 +3328,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 	u64 mapping_size;
 	int result;
 
+	if (rq->cmd_type != REQ_TYPE_FS) {
+		dout("%s: non-fs request type %d\n", __func__,
+			(int) rq->cmd_type);
+		result = -EIO;
+		goto err;
+	}
+
 	if (rq->cmd_flags & REQ_DISCARD)
 		op_type = OBJ_OP_DISCARD;
 	else if (rq->cmd_flags & REQ_WRITE)
@@ -3365,6 +3380,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 		goto err_rq;	/* Shouldn't happen */
 	}
 
+	blk_mq_start_request(rq);
+
 	down_read(&rbd_dev->header_rwsem);
 	mapping_size = rbd_dev->mapping.size;
 	if (op_type != OBJ_OP_READ) {
@@ -3410,53 +3427,18 @@ err_rq:
 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
 			 obj_op_name(op_type), length, offset, result);
 	ceph_put_snap_context(snapc);
-	blk_end_request_all(rq, result);
+err:
+	blk_mq_end_request(rq, result);
 }
 
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
 {
-	struct rbd_device *rbd_dev =
-	    container_of(work, struct rbd_device, rq_work);
-	struct request *rq, *next;
-	LIST_HEAD(requests);
-
-	spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
-	list_splice_init(&rbd_dev->rq_queue, &requests);
-	spin_unlock_irq(&rbd_dev->lock);
-
-	list_for_each_entry_safe(rq, next, &requests, queuelist) {
-		list_del_init(&rq->queuelist);
-		rbd_handle_request(rbd_dev, rq);
-	}
-}
-
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule().  Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
-	struct rbd_device *rbd_dev = q->queuedata;
-	struct request *rq;
-	int queued = 0;
-
-	rbd_assert(rbd_dev);
-
-	while ((rq = blk_fetch_request(q))) {
-		/* Ignore any non-FS requests that filter through. */
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			dout("%s: non-fs request type %d\n", __func__,
-				(int) rq->cmd_type);
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
-
-		list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
-		queued++;
-	}
+	struct request *rq = bd->rq;
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
 
-	if (queued)
-		queue_work(rbd_wq, &rbd_dev->rq_work);
+	queue_work(rbd_wq, work);
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 /*
@@ -3517,6 +3499,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 		del_gendisk(disk);
 		if (disk->queue)
 			blk_cleanup_queue(disk->queue);
+		blk_mq_free_tag_set(&rbd_dev->tag_set);
 	}
 	put_disk(disk);
 }
@@ -3728,11 +3711,28 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 	return 0;
 }
 
+static int rbd_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+	INIT_WORK(work, rbd_queue_workfn);
+	return 0;
+}
+
+static struct blk_mq_ops rbd_mq_ops = {
+	.queue_rq	= rbd_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_request	= rbd_init_request,
+};
+
 static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
 	u64 segment_size;
+	int err;
 
 	/* create gendisk info */
 	disk = alloc_disk(single_major ?
@@ -3750,10 +3750,24 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	disk->fops = &rbd_bd_ops;
 	disk->private_data = rbd_dev;
 
-	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
-	if (!q)
+	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+	rbd_dev->tag_set.ops = &rbd_mq_ops;
+	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+	rbd_dev->tag_set.flags =
+		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	rbd_dev->tag_set.nr_hw_queues = 1;
+	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+	if (err)
 		goto out_disk;
 
+	err = -ENOMEM;
+	q = blk_mq_init_queue(&rbd_dev->tag_set);
+	if (!q)
+		goto out_tag_set;
+
 	/* We use the default size, but let's be explicit about it. */
 	blk_queue_physical_block_size(q, SECTOR_SIZE);
 
@@ -3779,10 +3793,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	rbd_dev->disk = disk;
 
 	return 0;
+out_tag_set:
+	blk_mq_free_tag_set(&rbd_dev->tag_set);
 out_disk:
 	put_disk(disk);
-
-	return -ENOMEM;
+	return err;
 }
 
 /*
@@ -4039,8 +4054,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
-	INIT_LIST_HEAD(&rbd_dev->rq_queue);
-	INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
 	rbd_dev->flags = 0;
 	atomic_set(&rbd_dev->parent_ref, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
-- 
1.9.1


