linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Philipp Reisner <philipp.reisner@linbit.com>
To: Jens Axboe <axboe@fb.com>, linux-kernel@vger.kernel.org
Cc: drbd-dev@lists.linbit.com
Subject: [PATCH 10/30] drbd: allow parallel flushes for multi-volume resources
Date: Mon, 13 Jun 2016 16:08:58 +0200	[thread overview]
Message-ID: <1465826958-19398-11-git-send-email-philipp.reisner@linbit.com> (raw)
In-Reply-To: <1465826958-19398-1-git-send-email-philipp.reisner@linbit.com>

From: Lars Ellenberg <lars.ellenberg@linbit.com>

To maintain write-order fidelity across all volumes in a DRBD resource,
the receiver of a P_BARRIER needs to issue flushes to all volumes.
We used to do this by calling blkdev_issue_flush(), synchronously,
one volume at a time.

We now submit all flushes to all volumes in parallel, then wait for all
completions, to reduce worst-case latencies on multi-volume resources.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 113 +++++++++++++++++++++++++++++--------
 1 file changed, 88 insertions(+), 25 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4cfc721..a2e7ba9 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1204,13 +1204,83 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in
 	return err;
 }
 
-static void drbd_flush(struct drbd_connection *connection)
+/* This is blkdev_issue_flush, but asynchronous.
+ * We want to submit to all component volumes in parallel,
+ * then wait for all completions.
+ */
+struct issue_flush_context {
+	atomic_t pending;
+	int error;
+	struct completion done;
+};
+struct one_flush_context {
+	struct drbd_device *device;
+	struct issue_flush_context *ctx;
+};
+
+void one_flush_endio(struct bio *bio)
 {
-	int rv;
-	struct drbd_peer_device *peer_device;
-	int vnr;
+	struct one_flush_context *octx = bio->bi_private;
+	struct drbd_device *device = octx->device;
+	struct issue_flush_context *ctx = octx->ctx;
+
+	if (bio->bi_error) {
+		ctx->error = bio->bi_error;
+		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
+	}
+	kfree(octx);
+	bio_put(bio);
+
+	clear_bit(FLUSH_PENDING, &device->flags);
+	put_ldev(device);
+	kref_put(&device->kref, drbd_destroy_device);
+
+	if (atomic_dec_and_test(&ctx->pending))
+		complete(&ctx->done);
+}
+
+static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO, 0);
+	struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
+	if (!bio || !octx) {
+		drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
+		/* FIXME: what else can I do now?  disconnecting or detaching
+		 * really does not help to improve the state of the world, either.
+		 */
+		kfree(octx);
+		if (bio)
+			bio_put(bio);
 
+		ctx->error = -ENOMEM;
+		put_ldev(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		return;
+	}
+
+	octx->device = device;
+	octx->ctx = ctx;
+	bio->bi_bdev = device->ldev->backing_bdev;
+	bio->bi_private = octx;
+	bio->bi_end_io = one_flush_endio;
+
+	device->flush_jif = jiffies;
+	set_bit(FLUSH_PENDING, &device->flags);
+	atomic_inc(&ctx->pending);
+	submit_bio(WRITE_FLUSH, bio);
+}
+
+static void drbd_flush(struct drbd_connection *connection)
+{
 	if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
+		struct drbd_peer_device *peer_device;
+		struct issue_flush_context ctx;
+		int vnr;
+
+		atomic_set(&ctx.pending, 1);
+		ctx.error = 0;
+		init_completion(&ctx.done);
+
 		rcu_read_lock();
 		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 			struct drbd_device *device = peer_device->device;
@@ -1220,31 +1290,24 @@ static void drbd_flush(struct drbd_connection *connection)
 			kref_get(&device->kref);
 			rcu_read_unlock();
 
-			/* Right now, we have only this one synchronous code path
-			 * for flushes between request epochs.
-			 * We may want to make those asynchronous,
-			 * or at least parallelize the flushes to the volume devices.
-			 */
-			device->flush_jif = jiffies;
-			set_bit(FLUSH_PENDING, &device->flags);
-			rv = blkdev_issue_flush(device->ldev->backing_bdev,
-					GFP_NOIO, NULL);
-			clear_bit(FLUSH_PENDING, &device->flags);
-			if (rv) {
-				drbd_info(device, "local disk flush failed with status %d\n", rv);
-				/* would rather check on EOPNOTSUPP, but that is not reliable.
-				 * don't try again for ANY return value != 0
-				 * if (rv == -EOPNOTSUPP) */
-				drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
-			}
-			put_ldev(device);
-			kref_put(&device->kref, drbd_destroy_device);
+			submit_one_flush(device, &ctx);
 
 			rcu_read_lock();
-			if (rv)
-				break;
 		}
 		rcu_read_unlock();
+
+		/* Do we want to add a timeout,
+		 * if disk-timeout is set? */
+		if (!atomic_dec_and_test(&ctx.pending))
+			wait_for_completion(&ctx.done);
+
+		if (ctx.error) {
+			/* would rather check on EOPNOTSUPP, but that is not reliable.
+			 * don't try again for ANY return value != 0
+			 * if (rv == -EOPNOTSUPP) */
+			/* Any error is already reported by bio_endio callback. */
+			drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
+		}
 	}
 }
 
-- 
2.7.4

  parent reply	other threads:[~2016-06-13 14:26 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-06-13 14:08 [PATCH 00/30] DRBD updates Philipp Reisner
2016-06-13 14:08 ` [PATCH 01/30] drbd: bitmap bulk IO: do not always suspend IO Philipp Reisner
2016-06-13 14:08 ` [PATCH 02/30] drbd: change bitmap write-out when leaving resync states Philipp Reisner
2016-06-13 14:08 ` [PATCH 03/30] drbd: Kill code duplication Philipp Reisner
2016-06-13 14:08 ` [PATCH 04/30] drbd: Implement handling of thinly provisioned storage on resync target nodes Philipp Reisner
2016-06-13 14:08 ` [PATCH 05/30] drbd: Introduce new disk config option rs-discard-granularity Philipp Reisner
2016-06-13 14:08 ` [PATCH 06/30] drbd: Create the protocol feature THIN_RESYNC Philipp Reisner
2016-06-13 14:08 ` [PATCH 07/30] drbd: adjust assert in w_bitmap_io to account for BM_LOCKED_CHANGE_ALLOWED Philipp Reisner
2016-06-13 14:08 ` [PATCH 08/30] drbd: fix regression: protocol A sometimes synchronous, C sometimes double-latency Philipp Reisner
2016-06-13 14:08 ` [PATCH 09/30] drbd: fix for truncated minor number in callback command line Philipp Reisner
2016-06-13 14:08 ` Philipp Reisner [this message]
2016-06-13 14:08 ` [PATCH 11/30] drbd: when receiving P_TRIM, zero-out partial unaligned chunks Philipp Reisner
2016-06-13 14:09 ` [PATCH 12/30] drbd: possibly disable discard support, if backend has discard_zeroes_data=0 Philipp Reisner
2016-06-13 14:09 ` [PATCH 13/30] drbd: zero-out partial unaligned discards on local backend Philipp Reisner
2016-06-13 14:09 ` [PATCH 14/30] drbd: allow larger max_discard_sectors Philipp Reisner
2016-06-13 14:09 ` [PATCH 15/30] drbd: finish resync on sync source only by notification from sync target Philipp Reisner
2016-06-13 14:09 ` [PATCH 16/30] drbd: introduce unfence-peer handler Philipp Reisner
2016-06-13 14:09 ` [PATCH 17/30] drbd: don't forget error completion when "unsuspending" IO Philipp Reisner
2016-06-13 14:09 ` [PATCH 18/30] drbd: if there is no good data accessible, writes should be IO errors Philipp Reisner
2016-06-13 14:09 ` [PATCH 19/30] drbd: only restart frozen disk io when D_UP_TO_DATE Philipp Reisner
2016-06-13 14:09 ` [PATCH 20/30] drbd: discard_zeroes_if_aligned allows "thin" resync for discard_zeroes_data=0 Philipp Reisner
2016-06-13 14:09 ` [PATCH 21/30] drbd: report sizes if rejecting too small peer disk Philipp Reisner
2016-06-13 14:09 ` [PATCH 22/30] drbd: introduce WRITE_SAME support Philipp Reisner
2016-06-13 14:09 ` [PATCH 23/30] drbd: sync_handshake: handle identical uuids with current (frozen) Primary Philipp Reisner
2016-06-13 14:09 ` [PATCH 24/30] drbd: disallow promotion during resync handshake, avoid deadlock and hard reset Philipp Reisner
2016-06-13 14:09 ` [PATCH 25/30] drbd: bump current uuid when resuming IO with diskless peer Philipp Reisner
2016-06-13 14:09 ` [PATCH 26/30] drbd: code cleanups without semantic changes Philipp Reisner
2016-06-13 14:09 ` [PATCH 27/30] drbd: get rid of empty statement in is_valid_state Philipp Reisner
2016-06-13 14:09 ` [PATCH 28/30] drbd: finally report ms, not jiffies, in log message Philipp Reisner
2016-06-13 14:09 ` [PATCH 29/30] drbd: al_write_transaction: skip re-scanning of bitmap page pointer array Philipp Reisner
2016-06-13 14:09 ` [PATCH 30/30] drbd: correctly handle failed crypto_alloc_hash Philipp Reisner
2016-06-13 15:11 ` [PATCH 00/30] DRBD updates Jens Axboe
2016-06-13 22:26   ` Philipp Reisner
2016-06-13 22:26   ` [PATCH 01/30] drbd: bitmap bulk IO: do not always suspend IO Philipp Reisner
2016-06-13 22:26   ` [PATCH 02/30] drbd: change bitmap write-out when leaving resync states Philipp Reisner
2016-06-13 22:26   ` [PATCH 03/30] drbd: Kill code duplication Philipp Reisner
2016-06-13 22:26   ` [PATCH 04/30] drbd: Implement handling of thinly provisioned storage on resync target nodes Philipp Reisner
2016-06-13 22:26   ` [PATCH 05/30] drbd: Introduce new disk config option rs-discard-granularity Philipp Reisner
2016-06-13 22:26   ` [PATCH 06/30] drbd: Create the protocol feature THIN_RESYNC Philipp Reisner
2016-06-13 22:26   ` [PATCH 07/30] drbd: adjust assert in w_bitmap_io to account for BM_LOCKED_CHANGE_ALLOWED Philipp Reisner
2016-06-13 22:26   ` [PATCH 08/30] drbd: fix regression: protocol A sometimes synchronous, C sometimes double-latency Philipp Reisner
2016-06-13 22:26   ` [PATCH 09/30] drbd: fix for truncated minor number in callback command line Philipp Reisner
2016-06-13 22:26   ` [PATCH 10/30] drbd: allow parallel flushes for multi-volume resources Philipp Reisner
2016-06-13 22:26   ` [PATCH 11/30] drbd: when receiving P_TRIM, zero-out partial unaligned chunks Philipp Reisner
2016-06-13 22:26   ` [PATCH 12/30] drbd: possibly disable discard support, if backend has discard_zeroes_data=0 Philipp Reisner
2016-06-13 22:26   ` [PATCH 13/30] drbd: zero-out partial unaligned discards on local backend Philipp Reisner
2016-06-13 22:26   ` [PATCH 14/30] drbd: allow larger max_discard_sectors Philipp Reisner
2016-06-13 22:26   ` [PATCH 15/30] drbd: finish resync on sync source only by notification from sync target Philipp Reisner
2016-06-13 22:26   ` [PATCH 16/30] drbd: introduce unfence-peer handler Philipp Reisner
2016-06-13 22:26   ` [PATCH 17/30] drbd: don't forget error completion when "unsuspending" IO Philipp Reisner
2016-06-13 22:26   ` [PATCH 18/30] drbd: if there is no good data accessible, writes should be IO errors Philipp Reisner
2016-06-13 22:26   ` [PATCH 19/30] drbd: only restart frozen disk io when D_UP_TO_DATE Philipp Reisner
2016-06-13 22:26   ` [PATCH 20/30] drbd: discard_zeroes_if_aligned allows "thin" resync for discard_zeroes_data=0 Philipp Reisner
2016-06-13 22:26   ` [PATCH 21/30] drbd: report sizes if rejecting too small peer disk Philipp Reisner
2016-06-13 22:26   ` [PATCH 22/30] drbd: introduce WRITE_SAME support Philipp Reisner
2016-06-13 22:26   ` [PATCH 23/30] drbd: sync_handshake: handle identical uuids with current (frozen) Primary Philipp Reisner
2016-06-13 22:26   ` [PATCH 24/30] drbd: disallow promotion during resync handshake, avoid deadlock and hard reset Philipp Reisner
2016-06-13 22:26   ` [PATCH 25/30] drbd: bump current uuid when resuming IO with diskless peer Philipp Reisner
2016-06-13 22:26   ` [PATCH 26/30] drbd: code cleanups without semantic changes Philipp Reisner
2016-06-13 22:26   ` [PATCH 27/30] drbd: get rid of empty statement in is_valid_state Philipp Reisner
2016-06-13 22:26   ` [PATCH 28/30] drbd: finally report ms, not jiffies, in log message Philipp Reisner
2016-06-13 22:26   ` [PATCH 29/30] drbd: al_write_transaction: skip re-scanning of bitmap page pointer array Philipp Reisner
2016-06-13 22:26   ` [PATCH 30/30] drbd: correctly handle failed crypto_alloc_hash Philipp Reisner
  -- strict thread matches above, loose matches on Subject: below --
2016-04-25 12:07 [PATCH 00/30] DBRD updates Philipp Reisner
2016-04-25 12:07 ` [PATCH 10/30] drbd: allow parallel flushes for multi-volume resources Philipp Reisner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1465826958-19398-11-git-send-email-philipp.reisner@linbit.com \
    --to=philipp.reisner@linbit.com \
    --cc=axboe@fb.com \
    --cc=drbd-dev@lists.linbit.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).