From: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
To: neilb@suse.de
Cc: linux-raid@vger.kernel.org,
	Florian-Ewald Mueller <florian-ewald.mueller@profitbricks.com>,
	Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Subject: [RFC PATCH 2/4] md: introduce request function mode support
Date: Wed,  4 Jun 2014 19:10:00 +0200
Message-ID: <1401901802-16296-3-git-send-email-sebastian.riemer@profitbricks.com>
In-Reply-To: <1401901802-16296-1-git-send-email-sebastian.riemer@profitbricks.com>

From: Florian-Ewald Mueller <florian-ewald.mueller@profitbricks.com>

This introduces the writable module parameter 'rq_mode', which sets
the I/O mode for all subsequently created MD devices. Set it to 0 for
the default make_request function mode, which processes I/O bio by
bio, or set it to 1 for the new request function mode, which
processes I/O request by request. Common code is shared between both
modes.
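
For example, to create an array in request function mode (a usage
sketch; this assumes md is built as the md_mod module so that the
parameter is visible under sysfs, and the device names are
illustrative):

  echo 1 > /sys/module/md_mod/parameters/rq_mode
  mdadm --create /dev/md0 --level=1 --raid-devices=2 /dev/sdb /dev/sdc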

The advantage of the new mode is that an I/O scheduler can be used
and the block layer takes care of the I/O statistics.
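
Both are then reachable through the usual request-queue interfaces,
e.g. (illustrative commands, assuming an array named md0):

  cat /sys/block/md0/queue/scheduler
  echo deadline > /sys/block/md0/queue/scheduler
  grep ' md0 ' /proc/diskstats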

Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@profitbricks.com>
[spars: merged commits, changed description, fixed checkpatch warnings]
Signed-off-by: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
---
 drivers/md/md.c |  280 +++++++++++++++++++++++++++++++++++++++++++++++++------
 drivers/md/md.h |    7 ++
 2 files changed, 257 insertions(+), 30 deletions(-)
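
Note for reviewers: request completion relies on the usual "one
initial reference" counting scheme - md_process_request() starts the
clone count at 1, takes one more reference per cloned bio, and
md_request_clone_release() completes the request when the last
reference is dropped. A minimal user-space sketch of that pattern
(illustrative only, not kernel code, all names hypothetical):

  #include <stdatomic.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct request_clone {
  	atomic_int cnt;		/* 1 for the submitter + 1 per sub-bio */
  	int err;		/* first error wins */
  };

  static void clone_release(struct request_clone *rcl)
  {
  	/* the last dropped reference completes the request */
  	if (atomic_fetch_sub(&rcl->cnt, 1) == 1) {
  		printf("request completed, err=%d\n", rcl->err);
  		free(rcl);
  	}
  }

  int main(void)
  {
  	struct request_clone *rcl = malloc(sizeof(*rcl));
  	int i;

  	if (!rcl)
  		return 1;
  	atomic_init(&rcl->cnt, 1);		/* submitter's reference */
  	rcl->err = 0;

  	for (i = 0; i < 3; i++)
  		atomic_fetch_add(&rcl->cnt, 1);	/* one per cloned bio */

  	for (i = 0; i < 3; i++)
  		clone_release(rcl);		/* each bio's end_io drops one */

  	clone_release(rcl);	/* submitter drops the initial reference */
  	return 0;
  }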

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8c653f9..0e5c420 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -56,8 +56,6 @@
 
 #ifdef BIO_ACCOUNTING_EXTENSION
 
-#include <linux/ratelimit.h>
-
 struct md_bio_private {
 	void		(*orig_bio_endio)(struct bio *, int);
 	void		*orig_bio_private;
@@ -68,6 +66,30 @@ struct md_bio_private {
 
 static struct kmem_cache *md_bio_private_cache __read_mostly;
 
+#endif	/* BIO_ACCOUNTING_EXTENSION */
+
+#ifdef MD_REQUEST_FUNCTION
+
+struct md_request_clone {
+	struct work_struct work;
+	struct mddev	*mdp;
+	struct request	*req;
+	struct bio_list	bios;
+	atomic_t	cnt;
+	int		err;
+};
+
+#define MD_RQ_MODE_DEFAULT	0
+
+static unsigned int rq_mode __read_mostly = MD_RQ_MODE_DEFAULT;
+static struct kmem_cache *md_request_clone_cache __read_mostly;
+
+#endif	/* MD_REQUEST_FUNCTION */
+
+#if defined BIO_ACCOUNTING_EXTENSION || defined MD_REQUEST_FUNCTION
+
+#include <linux/ratelimit.h>
+
 static DEFINE_RATELIMIT_STATE(md_ratelimit_state,
 			DEFAULT_RATELIMIT_INTERVAL,
 			DEFAULT_RATELIMIT_BURST);
@@ -78,7 +100,7 @@ static inline int __must_check md_valid_ptr(const void *p)
 }
 #define VALID_PTR(p)	md_valid_ptr(p)
 
-#endif	/* BIO_ACCOUNTING_EXTENSION */
+#endif	/* BIO_ACCOUNTING_EXTENSION || MD_REQUEST_FUNCTION */
 
 #ifndef MODULE
 static void autostart_arrays(int part);
@@ -326,31 +348,17 @@ static void md_bio_endio(struct bio *bio, int err)
 
 #endif	/* BIO_ACCOUNTING_EXTENSION */
 
-/* Rather than calling directly into the personality make_request function,
- * IO requests come here first so that we can check if the device is
- * being suspended pending a reconfiguration.
- * We hold a refcount over the call to ->make_request.  By the time that
- * call has finished, the bio has been linked into some internal structure
- * and so is visible to ->quiesce(), so we don't need the refcount any more.
- */
-static void md_make_request(struct request_queue *q, struct bio *bio)
+static inline int md_make_request_head(struct mddev *mddev, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
-	struct mddev *mddev = q->queuedata;
-	int cpu;
-	unsigned int sectors;
-#ifdef BIO_ACCOUNTING_EXTENSION
-	struct md_bio_private *mbp;
-#endif	/* BIO_ACCOUNTING_EXTENSION */
 
-	if (mddev == NULL || mddev->pers == NULL
-	    || !mddev->ready) {
+	if (mddev == NULL || mddev->pers == NULL || !mddev->ready) {
 		bio_io_error(bio);
-		return;
+		return 1;
 	}
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
-		return;
+		return 1;
 	}
 	smp_rmb(); /* Ensure implications of  'active' are visible */
 	rcu_read_lock();
@@ -369,6 +377,39 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 	}
 	atomic_inc(&mddev->active_io);
 	rcu_read_unlock();
+	return 0;
+}
+
+static inline void md_make_request_body(struct mddev *mddev, struct bio *bio)
+{
+	mddev->pers->make_request(mddev, bio);
+}
+
+static inline void md_make_request_tail(struct mddev *mddev, struct bio *bio)
+{
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+}
+
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static void md_make_request(struct request_queue *q, struct bio *bio)
+{
+	const int rw = bio_data_dir(bio);
+	struct mddev *mddev = q->queuedata;
+	int cpu;
+	unsigned int sectors;
+#ifdef BIO_ACCOUNTING_EXTENSION
+	struct md_bio_private *mbp;
+#endif	/* BIO_ACCOUNTING_EXTENSION */
+
+	if (unlikely(md_make_request_head(mddev, bio)))
+		return;
 
 	/*
 	 * save the sectors now since our bio can
@@ -397,7 +438,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 		bio->bi_private = mbp;
 	}
 #endif	/* BIO_ACCOUNTING_EXTENSION */
-	mddev->pers->make_request(mddev, bio);
+	md_make_request_body(mddev, bio);
 
 #ifndef BIO_ACCOUNTING_EXTENSION
 	cpu = part_stat_lock();
@@ -406,10 +447,131 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 	part_stat_unlock();
 #endif	/* !BIO_ACCOUNTING_EXTENSION */
 
-	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
-		wake_up(&mddev->sb_wait);
+	md_make_request_tail(mddev, bio);
+}
+
+#ifdef MD_REQUEST_FUNCTION
+
+static inline void md_make_request_bio(struct mddev *mddev, struct bio *bio)
+{
+	if (unlikely(md_make_request_head(mddev, bio)))
+		return;
+	md_make_request_body(mddev, bio);
+	md_make_request_tail(mddev, bio);
+}
+
+static inline void md_request_clone_release(struct md_request_clone *rcl)
+{
+	if (atomic_dec_and_test(&rcl->cnt)) {
+		blk_end_request_all(rcl->req, rcl->err);
+		kmem_cache_free(md_request_clone_cache, rcl);
+	}
+}
+
+static void md_request_bio_endio(struct bio *bio, int err)
+{
+	struct md_request_clone *rcl = bio->bi_private;
+
+	if (unlikely(err < 0))
+		rcl->err = err;
+
+	bio_put(bio);
+	md_request_clone_release(rcl);
+}
+
+static void md_request_clone_worker(struct work_struct *wkp)
+{
+	struct md_request_clone *rcl =
+		container_of(wkp, struct md_request_clone, work);
+	struct bio_list *blp = &rcl->bios;
+	struct mddev *mddev = rcl->mdp;
+	struct bio *bio;
+
+	bio = bio_list_pop(blp);
+	while (VALID_PTR(bio)) {
+		md_make_request_bio(mddev, bio);
+		bio = bio_list_pop(blp);
+	}
+	md_request_clone_release(rcl);
 }
 
+static inline int md_process_request(struct mddev *mddev, struct request *req)
+{
+	struct md_request_clone *rcl;
+
+	struct bio *bio, *clone;
+	int error;
+
+	rcl = kmem_cache_alloc(md_request_clone_cache, GFP_NOIO);
+	if (unlikely(!VALID_PTR(rcl))) {
+		if (__ratelimit(&md_ratelimit_state))
+			pr_warn("%s: [%s] kmem_cache_alloc failed\n",
+				__func__, mdname(mddev));
+		return -ENOMEM;
+	}
+	rcl->err = 0;
+	rcl->req = req;
+	rcl->mdp = mddev;
+	atomic_set(&rcl->cnt, 1);
+	bio_list_init(&rcl->bios);
+	bio = req->bio;
+	while (VALID_PTR(bio)) {
+		clone = bio_clone(bio, GFP_NOWAIT);
+		if (unlikely(!VALID_PTR(clone))) {
+			if (__ratelimit(&md_ratelimit_state))
+				pr_warn("%s: [%s] bio_clone failed\n",
+					__func__, mdname(mddev));
+			error = -ENOMEM;
+			goto error_out;
+		}
+		clone->bi_private = rcl;
+		clone->bi_end_io = md_request_bio_endio;
+		bio_list_add(&rcl->bios, clone);
+		atomic_inc(&rcl->cnt);
+		bio = bio->bi_next;
+	}
+	INIT_WORK(&rcl->work, md_request_clone_worker);
+	queue_work(mddev->request_wq, &rcl->work);
+	return 0;
+error_out:
+	bio = bio_list_pop(&rcl->bios);
+	while (VALID_PTR(bio)) {
+		bio_put(bio);
+		bio = bio_list_pop(&rcl->bios);
+	}
+	kmem_cache_free(md_request_clone_cache, rcl);
+	return error;
+}
+
+#ifndef blk_fs_request
+#define blk_fs_request(p)	((p)->cmd_type == REQ_TYPE_FS)
+#endif	/* !blk_fs_request */
+
+static void md_request_function(struct request_queue *rqp)
+{
+	struct mddev *mddev = rqp->queuedata;
+
+	struct request *req;
+	int rc;
+
+	while ((req = blk_fetch_request(rqp)) != NULL) {
+		if (unlikely(!blk_fs_request(req))) {
+			if (__ratelimit(&md_ratelimit_state))
+				pr_warn("%s: [%s] non-fs request\n",
+					__func__, mdname(mddev));
+			__blk_end_request_all(req, -ENOTSUPP);
+			continue;
+		}
+		spin_unlock_irq(rqp->queue_lock);
+		rc = md_process_request(mddev, req);
+		spin_lock_irq(rqp->queue_lock);
+		if (unlikely(rc < 0))
+			__blk_end_request_all(req, rc);
+	}
+}
+
+#endif	/* MD_REQUEST_FUNCTION */
+
 /* mddev_suspend makes sure no new requests are submitted
  * to the device, and that any requests that have been submitted
  * are completely handled.
@@ -567,8 +729,15 @@ static void mddev_put(struct mddev *mddev)
 			 */
 			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
 			queue_work(md_misc_wq, &mddev->del_work);
-		} else
+		} else {
+#ifdef MD_REQUEST_FUNCTION
+			if (likely(VALID_PTR(mddev->request_wq))) {
+				destroy_workqueue(mddev->request_wq);
+				mddev->request_wq = NULL;
+			}
+#endif	/* MD_REQUEST_FUNCTION */
 			kfree(mddev);
+		}
 	}
 	spin_unlock(&all_mddevs_lock);
 	if (bs)
@@ -4923,6 +5092,13 @@ static void md_free(struct kobject *ko)
 	if (mddev->queue)
 		blk_cleanup_queue(mddev->queue);
 
+#ifdef MD_REQUEST_FUNCTION
+	if (likely(VALID_PTR(mddev->request_wq))) {
+		destroy_workqueue(mddev->request_wq);
+		mddev->request_wq = NULL;
+	}
+#endif	/* MD_REQUEST_FUNCTION */
+
 	kfree(mddev);
 }
 
@@ -4990,12 +5166,32 @@ static int md_alloc(dev_t dev, char *name)
 	}
 
 	error = -ENOMEM;
-	mddev->queue = blk_alloc_queue(GFP_KERNEL);
-	if (!mddev->queue)
-		goto abort;
+#ifdef MD_REQUEST_FUNCTION
+	if (!rq_mode) {
+#endif	/* MD_REQUEST_FUNCTION */
+		mddev->queue = blk_alloc_queue(GFP_KERNEL);
+		if (!mddev->queue)
+			goto abort;
+		blk_queue_make_request(mddev->queue, md_make_request);
+#ifdef MD_REQUEST_FUNCTION
+	} else {
+		mddev->request_wq =
+			create_singlethread_workqueue(mdname(mddev));
+		if (unlikely(!VALID_PTR(mddev->request_wq))) {
+			pr_warn("%s: create_singlethread_workqueue (%s) failed\n",
+				__func__, mdname(mddev));
+			goto abort;
+		}
+		mddev->queue = blk_init_queue(md_request_function, NULL);
+		if (!mddev->queue) {
+			destroy_workqueue(mddev->request_wq);
+			mddev->request_wq = NULL;
+			goto abort;
+		}
+	}
+#endif	/* MD_REQUEST_FUNCTION */
 	mddev->queue->queuedata = mddev;
 
-	blk_queue_make_request(mddev->queue, md_make_request);
 	blk_set_stacking_limits(&mddev->queue->limits);
 
 	disk = alloc_disk(1 << shift);
@@ -8714,11 +8910,23 @@ static int __init md_init(void)
 #ifdef BIO_ACCOUNTING_EXTENSION
 	md_bio_private_cache = KMEM_CACHE(md_bio_private, 0);
 	if (unlikely(!VALID_PTR(md_bio_private_cache))) {
-		pr_err("%s: KMEM_CACHE failed\n", __func__);
+		pr_err("%s: KMEM_CACHE (bio_priv) failed\n", __func__);
 		return -ENOMEM;
 	}
 #endif	/* BIO_ACCOUNTING_EXTENSION */
 
+#ifdef MD_REQUEST_FUNCTION
+	md_request_clone_cache = KMEM_CACHE(md_request_clone, 0);
+	if (unlikely(!VALID_PTR(md_request_clone_cache))) {
+		pr_err("%s: KMEM_CACHE (req_clone) failed\n", __func__);
+#ifdef BIO_ACCOUNTING_EXTENSION
+		kmem_cache_destroy(md_bio_private_cache);
+		md_bio_private_cache = NULL;
+#endif	/* BIO_ACCOUNTING_EXTENSION */
+		return -ENOMEM;
+	}
+#endif	/* MD_REQUEST_FUNCTION */
+
 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
 	if (!md_wq)
 		goto err_wq;
@@ -8856,6 +9064,13 @@ static __exit void md_exit(void)
 	destroy_workqueue(md_misc_wq);
 	destroy_workqueue(md_wq);
 
+#ifdef MD_REQUEST_FUNCTION
+	if (likely(VALID_PTR(md_request_clone_cache))) {
+		kmem_cache_destroy(md_request_clone_cache);
+		md_request_clone_cache = NULL;
+	}
+#endif	/* MD_REQUEST_FUNCTION */
+
 #ifdef BIO_ACCOUNTING_EXTENSION
 	if (likely(VALID_PTR(md_bio_private_cache))) {
 		kmem_cache_destroy(md_bio_private_cache);
@@ -8887,6 +9102,11 @@ module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
 
 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
 
+#ifdef MD_REQUEST_FUNCTION
+module_param(rq_mode, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(rq_mode, " this module's I/O input mode (default: 0 [make request mode], 1 [request function mode])");
+#endif	/* MD_REQUEST_FUNCTION */
+
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
 EXPORT_SYMBOL(md_error);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f0e9171..8d639e0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -25,6 +25,10 @@
 #include <linux/workqueue.h>
 
 #if 1
+#define MD_REQUEST_FUNCTION
+#endif
+
+#if 1
 #define BIO_ACCOUNTING_EXTENSION
 #endif
 
@@ -455,6 +459,9 @@ struct mddev {
 #ifdef BIO_ACCOUNTING_EXTENSION
 	struct md_stats stats;
 #endif	/* BIO_ACCOUNTING_EXTENSION */
+#ifdef MD_REQUEST_FUNCTION
+	struct workqueue_struct *request_wq;
+#endif	/* MD_REQUEST_FUNCTION */
 };
 
 
-- 
1.7.9.5


