From: Vivek Goyal <vgoyal@redhat.com>
To: Kent Overstreet <koverstreet@google.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>,
linux-bcache@vger.kernel.org, linux-kernel@vger.kernel.org,
dm-devel@redhat.com, tj@kernel.org, bharrosh@panasas.com,
Jens Axboe <axboe@kernel.dk>
Subject: Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
Date: Thu, 30 Aug 2012 18:07:45 -0400 [thread overview]
Message-ID: <20120830220745.GI27257@redhat.com> (raw)
In-Reply-To: <20120829171345.GC20312@google.com>
On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
[..]
> > Performance aside, punting submission to per device worker in case of deep
> > stack usage sounds cleaner solution to me.
>
> Agreed, but performance tends to matter in the real world. And either
> way the tricky bits are going to be confined to a few functions, so I
> don't think it matters that much.
>
> If someone wants to code up the workqueue version and test it, they're
> more than welcome...
Here is one quick and dirty proof-of-concept patch. It checks the stack
depth, and if the remaining space is less than 20% of the stack size, it
defers the bio submission to a per-queue worker.
Thanks
Vivek
---
block/blk-core.c | 171 ++++++++++++++++++++++++++++++++++------------
block/blk-sysfs.c | 1
include/linux/blk_types.h | 1
include/linux/blkdev.h | 8 ++
4 files changed, 138 insertions(+), 43 deletions(-)
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h 2012-09-01 17:44:51.686485550 -0400
+++ linux-2.6/include/linux/blkdev.h 2012-09-01 18:09:58.805577658 -0400
@@ -430,6 +430,14 @@ struct request_queue {
/* Throttle data */
struct throtl_data *td;
#endif
+
+ /*
+ * Bio submission to queue can be deferred to a workqueue if stack
+ * usage of submitter is high.
+ */
+ struct bio_list deferred_bios;
+ struct work_struct deferred_bio_work;
+ struct workqueue_struct *deferred_bio_workqueue;
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c 2012-09-01 17:44:51.686485550 -0400
+++ linux-2.6/block/blk-core.c 2012-09-02 00:34:55.204091269 -0400
@@ -211,6 +211,23 @@ static void blk_delay_work(struct work_s
spin_unlock_irq(q->queue_lock);
}
+static void blk_deferred_bio_work(struct work_struct *work)
+{
+ struct request_queue *q;
+ struct bio *bio = NULL;
+
+ q = container_of(work, struct request_queue, deferred_bio_work);
+
+ do {
+ spin_lock_irq(q->queue_lock);
+ bio = bio_list_pop(&q->deferred_bios);
+ spin_unlock_irq(q->queue_lock);
+ if (!bio)
+ break;
+ generic_make_request(bio);
+ } while (1);
+}
+
/**
* blk_delay_queue - restart queueing after defined interval
* @q: The &struct request_queue in question
@@ -289,6 +306,7 @@ void blk_sync_queue(struct request_queue
{
del_timer_sync(&q->timeout);
cancel_delayed_work_sync(&q->delay_work);
+ cancel_work_sync(&q->deferred_bio_work);
}
EXPORT_SYMBOL(blk_sync_queue);
@@ -351,6 +369,29 @@ void blk_put_queue(struct request_queue
EXPORT_SYMBOL(blk_put_queue);
/**
+ * blk_drain_deferred_bios - drain deferred bios
+ * @q: request_queue to drain deferred bios for
+ *
+ * Dispatch all currently deferred bios on @q through ->make_request_fn().
+ */
+static void blk_drain_deferred_bios(struct request_queue *q)
+{
+ struct bio_list bl;
+ struct bio *bio;
+ unsigned long flags;
+
+ bio_list_init(&bl);
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ bio_list_merge(&bl, &q->deferred_bios);
+ bio_list_init(&q->deferred_bios);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ while ((bio = bio_list_pop(&bl)))
+ generic_make_request(bio);
+}
+
+/**
* blk_drain_queue - drain requests from request_queue
* @q: queue to drain
* @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
@@ -358,6 +399,10 @@ EXPORT_SYMBOL(blk_put_queue);
* Drain requests from @q. If @drain_all is set, all requests are drained.
* If not, only ELVPRIV requests are drained. The caller is responsible
* for ensuring that no new requests which need to be drained are queued.
+ *
+ * Note: It does not drain bios on q->deferred_bios list.
+ * Call blk_drain_deferred_bios() if need be.
+ *
*/
void blk_drain_queue(struct request_queue *q, bool drain_all)
{
@@ -505,6 +550,9 @@ void blk_cleanup_queue(struct request_qu
spin_unlock_irq(lock);
mutex_unlock(&q->sysfs_lock);
+ /* First drain all deferred bios. */
+ blk_drain_deferred_bios(q);
+
/* drain all requests queued before DEAD marking */
blk_drain_queue(q, true);
@@ -614,11 +662,19 @@ struct request_queue *blk_alloc_queue_no
q->bypass_depth = 1;
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
- if (blkcg_init_queue(q))
+ bio_list_init(&q->deferred_bios);
+ INIT_WORK(&q->deferred_bio_work, blk_deferred_bio_work);
+ q->deferred_bio_workqueue = alloc_workqueue("kdeferbiod", WQ_MEM_RECLAIM, 0);
+ if (!q->deferred_bio_workqueue)
goto fail_id;
+ if (blkcg_init_queue(q))
+ goto fail_deferred_bio_wq;
+
return q;
+fail_deferred_bio_wq:
+ destroy_workqueue(q->deferred_bio_workqueue);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
@@ -1635,8 +1691,10 @@ static inline int bio_check_eod(struct b
return 0;
}
+
+
static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks_early(struct bio *bio)
{
struct request_queue *q;
int nr_sectors = bio_sectors(bio);
@@ -1715,9 +1773,6 @@ generic_make_request_checks(struct bio *
*/
create_io_context(GFP_ATOMIC, q->node);
- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
-
trace_block_bio_queue(q, bio);
return true;
@@ -1726,6 +1781,56 @@ end_io:
return false;
}
+static noinline_for_stack bool
+generic_make_request_checks_late(struct bio *bio)
+{
+ struct request_queue *q;
+
+ q = bdev_get_queue(bio->bi_bdev);
+
+ /*
+ * Various block parts want %current->io_context and lazy ioc
+ * allocation ends up trading a lot of pain for a small amount of
+ * memory. Just allocate it upfront. This may fail and block
+ * layer knows how to live with it.
+ */
+ create_io_context(GFP_ATOMIC, q->node);
+
+ if (blk_throtl_bio(q, bio))
+ return false; /* throttled, will be resubmitted later */
+
+ return true;
+}
+
+static void __generic_make_request(struct bio *bio)
+{
+ struct request_queue *q;
+
+ if (!generic_make_request_checks_late(bio))
+ return;
+ q = bdev_get_queue(bio->bi_bdev);
+ q->make_request_fn(q, bio);
+}
+
+static void generic_make_request_defer_bio(struct bio *bio)
+{
+ struct request_queue *q;
+ unsigned long flags;
+
+ q = bdev_get_queue(bio->bi_bdev);
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (unlikely(blk_queue_dead(q))) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ bio_endio(bio, -ENODEV);
+ return;
+ }
+ set_bit(BIO_DEFERRED, &bio->bi_flags);
+ bio_list_add(&q->deferred_bios, bio);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ queue_work(q->deferred_bio_workqueue, &q->deferred_bio_work);
+}
+
/**
* generic_make_request - hand a buffer to its device driver for I/O
* @bio: The bio describing the location in memory and on the device.
@@ -1752,51 +1857,31 @@ end_io:
*/
void generic_make_request(struct bio *bio)
{
- struct bio_list bio_list_on_stack;
+ unsigned long sp = 0;
+ unsigned int threshold = (THREAD_SIZE * 2)/10;
- if (!generic_make_request_checks(bio))
- return;
+ BUG_ON(bio->bi_next);
- /*
- * We only want one ->make_request_fn to be active at a time, else
- * stack usage with stacked devices could be a problem. So use
- * current->bio_list to keep a list of requests submited by a
- * make_request_fn function. current->bio_list is also used as a
- * flag to say if generic_make_request is currently active in this
- * task or not. If it is NULL, then no make_request is active. If
- * it is non-NULL, then a make_request is active, and new requests
- * should be added at the tail
- */
- if (current->bio_list) {
- bio_list_add(current->bio_list, bio);
+ /* Submitting deferred bio from worker context. */
+ if (bio_flagged(bio, BIO_DEFERRED)) {
+ clear_bit(BIO_DEFERRED, &bio->bi_flags);
+ __generic_make_request(bio);
return;
}
- /* following loop may be a bit non-obvious, and so deserves some
- * explanation.
- * Before entering the loop, bio->bi_next is NULL (as all callers
- * ensure that) so we have a list with a single bio.
- * We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
- * thus initialising the bio_list of new bios to be
- * added. ->make_request() may indeed add some more bios
- * through a recursive call to generic_make_request. If it
- * did, we find a non-NULL value in bio_list and re-enter the loop
- * from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so remove it from
- * bio_list, and call into ->make_request() again.
- */
- BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack);
- current->bio_list = &bio_list_on_stack;
- do {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ if (!generic_make_request_checks_early(bio))
+ return;
- q->make_request_fn(q, bio);
+ /*
+ * FIXME. Provide an arch dependent function to return left stack
+ * space for current task. This is hack for x86_64.
+ */
+ asm volatile("movq %%rsp,%0" : "=m"(sp));
- bio = bio_list_pop(current->bio_list);
- } while (bio);
- current->bio_list = NULL; /* deactivate */
+ if ((sp - (unsigned long)end_of_stack(current)) < threshold)
+ generic_make_request_defer_bio(bio);
+ else
+ __generic_make_request(bio);
}
EXPORT_SYMBOL(generic_make_request);
Index: linux-2.6/block/blk-sysfs.c
===================================================================
--- linux-2.6.orig/block/blk-sysfs.c 2012-09-01 17:44:51.686485550 -0400
+++ linux-2.6/block/blk-sysfs.c 2012-09-01 18:09:58.808577661 -0400
@@ -505,6 +505,7 @@ static void blk_release_queue(struct kob
ida_simple_remove(&blk_queue_ida, q->id);
kmem_cache_free(blk_requestq_cachep, q);
+ destroy_workqueue(q->deferred_bio_workqueue);
}
static const struct sysfs_ops queue_sysfs_ops = {
Index: linux-2.6/include/linux/blk_types.h
===================================================================
--- linux-2.6.orig/include/linux/blk_types.h 2012-09-02 00:34:17.607086696 -0400
+++ linux-2.6/include/linux/blk_types.h 2012-09-02 00:34:21.997087104 -0400
@@ -105,6 +105,7 @@ struct bio {
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+#define BIO_DEFERRED 12 /* Bio was deferred for submission by worker */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
next prev parent reply other threads:[~2012-08-30 22:08 UTC|newest]
Thread overview: 75+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 1/9] block: Generalized bio pool freeing Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 2/9] dm: Use bioset's front_pad for dm_rq_clone_bio_info Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 3/9] block: Add bio_reset() Kent Overstreet
2012-08-28 20:31 ` Tejun Heo
2012-08-28 22:17 ` Kent Overstreet
2012-08-28 22:53 ` Kent Overstreet
2012-09-01 2:23 ` Tejun Heo
2012-09-05 20:13 ` Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() Kent Overstreet
2012-08-28 20:32 ` Tejun Heo
2012-08-28 22:24 ` Kent Overstreet
2012-09-04 9:05 ` Jiri Kosina
2012-09-05 19:44 ` Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 5/9] block: Kill bi_destructor Kent Overstreet
2012-08-28 20:36 ` Tejun Heo
2012-08-28 22:07 ` Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 6/9] block: Consolidate bio_alloc_bioset(), bio_kmalloc() Kent Overstreet
2012-08-28 20:41 ` Tejun Heo
2012-08-28 22:03 ` Kent Overstreet
2012-09-01 2:17 ` Tejun Heo
2012-08-28 17:37 ` [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() Kent Overstreet
2012-08-28 20:44 ` Tejun Heo
2012-08-28 22:05 ` Kent Overstreet
2012-09-01 2:19 ` Tejun Heo
2012-08-28 17:37 ` [PATCH v7 8/9] block: Reorder struct bio_set Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Kent Overstreet
2012-08-28 20:49 ` Tejun Heo
2012-08-28 22:28 ` Kent Overstreet
2012-08-28 23:01 ` Kent Overstreet
2012-08-29 1:31 ` Vivek Goyal
2012-08-29 3:25 ` Kent Overstreet
2012-08-29 12:57 ` Vivek Goyal
2012-08-29 14:39 ` [dm-devel] " Alasdair G Kergon
2012-08-29 16:26 ` Kent Overstreet
2012-08-29 21:01 ` John Stoffel
2012-08-29 21:08 ` Kent Overstreet
2012-08-28 22:06 ` Vivek Goyal
2012-08-28 22:23 ` Kent Overstreet
2012-08-29 16:24 ` Mikulas Patocka
2012-08-29 16:50 ` Kent Overstreet
2012-08-29 16:57 ` [dm-devel] " Alasdair G Kergon
2012-08-29 17:07 ` Vivek Goyal
2012-08-29 17:13 ` Kent Overstreet
2012-08-29 17:23 ` [dm-devel] " Alasdair G Kergon
2012-08-29 17:32 ` Kent Overstreet
2012-08-30 22:07 ` Vivek Goyal [this message]
2012-08-31 1:43 ` Kent Overstreet
2012-08-31 1:55 ` Kent Overstreet
2012-08-31 15:01 ` Vivek Goyal
2012-09-03 1:26 ` Kent Overstreet
2012-09-03 20:41 ` Mikulas Patocka
2012-09-04 3:41 ` Kent Overstreet
2012-09-04 18:55 ` Tejun Heo
2012-09-04 19:01 ` Tejun Heo
2012-09-04 19:43 ` Kent Overstreet
2012-09-04 19:42 ` Kent Overstreet
2012-09-04 21:03 ` Tejun Heo
2012-09-04 19:26 ` Mikulas Patocka
2012-09-04 19:39 ` Vivek Goyal
2012-09-04 19:51 ` [PATCH] dm: Use bioset's front_pad for dm_target_io Kent Overstreet
2012-09-04 21:20 ` Tejun Heo
2012-09-11 19:28 ` [PATCH 2] " Mikulas Patocka
2012-09-11 19:50 ` Kent Overstreet
2012-09-12 22:31 ` Mikulas Patocka
2012-09-14 23:09 ` [dm-devel] " Alasdair G Kergon
2012-09-01 2:13 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Tejun Heo
2012-09-03 1:34 ` [PATCH v2] " Kent Overstreet
2012-09-04 15:00 ` [PATCH v7 9/9] " Vivek Goyal
2012-09-03 0:49 ` Dave Chinner
2012-09-03 1:17 ` Kent Overstreet
2012-09-04 13:54 ` Vivek Goyal
2012-09-04 18:26 ` Tejun Heo
2012-09-05 3:57 ` Dave Chinner
2012-09-05 4:37 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20120830220745.GI27257@redhat.com \
--to=vgoyal@redhat.com \
--cc=axboe@kernel.dk \
--cc=bharrosh@panasas.com \
--cc=dm-devel@redhat.com \
--cc=koverstreet@google.com \
--cc=linux-bcache@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mpatocka@redhat.com \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).