All of lore.kernel.org
 help / color / mirror / Atom feed
From: Philipp Reisner <philipp.reisner@linbit.com>
To: Jens Axboe <axboe@fb.com>, linux-kernel@vger.kernel.org
Cc: drbd-dev@lists.linbit.com
Subject: [PATCH 04/30] drbd: Implement handling of thinly provisioned storage on resync target nodes
Date: Mon, 13 Jun 2016 16:08:52 +0200	[thread overview]
Message-ID: <1465826958-19398-5-git-send-email-philipp.reisner@linbit.com> (raw)
In-Reply-To: <1465826958-19398-1-git-send-email-philipp.reisner@linbit.com>

If during resync we read only zeroes for a range of sectors assume
that these secotors can be discarded on the sync target node.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  5 +++
 drivers/block/drbd/drbd_main.c     | 18 ++++++++
 drivers/block/drbd/drbd_protocol.h |  4 ++
 drivers/block/drbd/drbd_receiver.c | 88 ++++++++++++++++++++++++++++++++++++--
 drivers/block/drbd/drbd_worker.c   | 29 ++++++++++++-
 5 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 7a1cf7e..1a93f4f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -471,6 +471,9 @@ enum {
 	/* this originates from application on peer
 	 * (not some resync or verify or other DRBD internal request) */
 	__EE_APPLICATION,
+
+	/* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
+	__EE_RS_THIN_REQ,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
@@ -485,6 +488,7 @@ enum {
 #define EE_SUBMITTED		(1<<__EE_SUBMITTED)
 #define EE_WRITE		(1<<__EE_WRITE)
 #define EE_APPLICATION		(1<<__EE_APPLICATION)
+#define EE_RS_THIN_REQ		(1<<__EE_RS_THIN_REQ)
 
 /* flag bits per device */
 enum {
@@ -1123,6 +1127,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
 extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 4c64cb9..dd2432e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1377,6 +1377,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
 			      cpu_to_be64(block_id));
 }
 
+int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
+			     struct drbd_peer_request *peer_req)
+{
+	struct drbd_socket *sock;
+	struct p_block_desc *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(peer_req->i.sector);
+	p->blksize = cpu_to_be32(peer_req->i.size);
+	p->pad = 0;
+	return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
+}
+
 int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
 		       sector_t sector, int size, u64 block_id)
 {
@@ -3681,6 +3697,8 @@ const char *cmdname(enum drbd_packet cmd)
 		[P_CONN_ST_CHG_REPLY]	= "conn_st_chg_reply",
 		[P_RETRY_WRITE]		= "retry_write",
 		[P_PROTOCOL_UPDATE]	= "protocol_update",
+		[P_RS_THIN_REQ]         = "rs_thin_req",
+		[P_RS_DEALLOCATED]      = "rs_deallocated",
 
 		/* enum drbd_packet, but not commands - obsoleted flags:
 		 *	P_MAY_IGNORE
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index ef92453..e5e74e3 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -60,6 +60,10 @@ enum drbd_packet {
 	 * which is why I chose TRIM here, to disambiguate. */
 	P_TRIM                = 0x31,
 
+	/* Only use these two if both support FF_THIN_RESYNC */
+	P_RS_THIN_REQ         = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
+	P_RS_DEALLOCATED      = 0x33, /* Contains only zeros on sync source node */
+
 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAX_OPT_CMD	      = 0x101,
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8b30ab5..3a6c2ec 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1417,9 +1417,15 @@ int drbd_submit_peer_request(struct drbd_device *device,
 		 * so we can find it to present it in debugfs */
 		peer_req->submit_jif = jiffies;
 		peer_req->flags |= EE_SUBMITTED;
-		spin_lock_irq(&device->resource->req_lock);
-		list_add_tail(&peer_req->w.list, &device->active_ee);
-		spin_unlock_irq(&device->resource->req_lock);
+
+		/* If this was a resync request from receive_rs_deallocated(),
+		 * it is already on the sync_ee list */
+		if (list_empty(&peer_req->w.list)) {
+			spin_lock_irq(&device->resource->req_lock);
+			list_add_tail(&peer_req->w.list, &device->active_ee);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+
 		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
 			sector, data_size >> 9, GFP_NOIO, false))
 			peer_req->flags |= EE_WAS_ERROR;
@@ -2574,6 +2580,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		case P_DATA_REQUEST:
 			drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
 			break;
+		case P_RS_THIN_REQ:
 		case P_RS_DATA_REQUEST:
 		case P_CSUM_RS_REQUEST:
 		case P_OV_REQUEST:
@@ -2613,6 +2620,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		peer_req->flags |= EE_APPLICATION;
 		goto submit;
 
+	case P_RS_THIN_REQ:
+		/* If at some point in the future we have a smart way to
+		   find out if this data block is completely deallocated,
+		   then we would do something smarter here than reading
+		   the block... */
+		peer_req->flags |= EE_RS_THIN_REQ;
 	case P_RS_DATA_REQUEST:
 		peer_req->w.cb = w_e_end_rsdata_req;
 		fault_type = DRBD_FAULT_RS_RD;
@@ -4587,6 +4600,72 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
 	return 0;
 }
 
+static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct p_block_desc *p = pi->data;
+	struct drbd_device *device;
+	sector_t sector;
+	int size, err = 0;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	dec_rs_pending(device);
+
+	if (get_ldev(device)) {
+		struct drbd_peer_request *peer_req;
+		const int rw = REQ_WRITE | REQ_DISCARD;
+
+		peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
+					       size, false, GFP_NOIO);
+		if (!peer_req) {
+			put_ldev(device);
+			return -ENOMEM;
+		}
+
+		peer_req->w.cb = e_end_resync_block;
+		peer_req->submit_jif = jiffies;
+		peer_req->flags |= EE_IS_TRIM;
+
+		spin_lock_irq(&device->resource->req_lock);
+		list_add_tail(&peer_req->w.list, &device->sync_ee);
+		spin_unlock_irq(&device->resource->req_lock);
+
+		atomic_add(pi->size >> 9, &device->rs_sect_ev);
+		err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_RS_WR);
+
+		if (err) {
+			spin_lock_irq(&device->resource->req_lock);
+			list_del(&peer_req->w.list);
+			spin_unlock_irq(&device->resource->req_lock);
+
+			drbd_free_peer_req(device, peer_req);
+			put_ldev(device);
+			err = 0;
+			goto fail;
+		}
+
+		inc_unacked(device);
+
+		/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
+		   as well as drbd_rs_complete_io() */
+	} else {
+	fail:
+		drbd_rs_complete_io(device, sector);
+		drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
+	}
+
+	atomic_add(size >> 9, &device->rs_sect_in);
+
+	return err;
+}
+
 struct data_cmd {
 	int expect_payload;
 	size_t pkt_size;
@@ -4614,11 +4693,14 @@ static struct data_cmd drbd_cmd_handler[] = {
 	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
 	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
 	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
 	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
 	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
 	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
 	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
 	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
+	[P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
+
 };
 
 static void drbdd(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d87499..01d74ee2 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1035,6 +1035,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
 	return err;
 }
 
+static bool all_zero(struct drbd_peer_request *peer_req)
+{
+	struct page *page = peer_req->pages;
+	unsigned int len = peer_req->i.size;
+
+	page_chain_for_each(page) {
+		unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
+		unsigned int i, words = l / sizeof(long);
+		unsigned long *d;
+
+		d = kmap_atomic(page);
+		for (i = 0; i < words; i++) {
+			if (d[i]) {
+				kunmap_atomic(d);
+				return false;
+			}
+		}
+		kunmap_atomic(d);
+		len -= l;
+	}
+
+	return true;
+}
+
 /**
  * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
  * @w:		work object.
@@ -1063,7 +1087,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
 	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
 		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
 			inc_rs_pending(device);
-			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+			if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
+				err = drbd_send_rs_deallocated(peer_device, peer_req);
+			else
+				err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
 		} else {
 			if (__ratelimit(&drbd_ratelimit_state))
 				drbd_err(device, "Not sending RSDataReply, "
-- 
2.7.4

  parent reply	other threads:[~2016-06-13 14:22 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-06-13 14:08 [PATCH 00/30] DRBD updates Philipp Reisner
2016-06-13 14:08 ` [PATCH 01/30] drbd: bitmap bulk IO: do not always suspend IO Philipp Reisner
2016-06-13 14:08 ` [PATCH 02/30] drbd: change bitmap write-out when leaving resync states Philipp Reisner
2016-06-13 14:08 ` [PATCH 03/30] drbd: Kill code duplication Philipp Reisner
2016-06-13 14:08 ` Philipp Reisner [this message]
2016-06-13 14:08 ` [PATCH 05/30] drbd: Introduce new disk config option rs-discard-granularity Philipp Reisner
2016-06-13 14:08 ` [PATCH 06/30] drbd: Create the protocol feature THIN_RESYNC Philipp Reisner
2016-06-13 14:08 ` [PATCH 07/30] drbd: adjust assert in w_bitmap_io to account for BM_LOCKED_CHANGE_ALLOWED Philipp Reisner
2016-06-13 14:08 ` [PATCH 08/30] drbd: fix regression: protocol A sometimes synchronous, C sometimes double-latency Philipp Reisner
2016-06-13 14:08 ` [PATCH 09/30] drbd: fix for truncated minor number in callback command line Philipp Reisner
2016-06-13 14:08 ` [PATCH 10/30] drbd: allow parallel flushes for multi-volume resources Philipp Reisner
2016-06-13 14:08 ` [PATCH 11/30] drbd: when receiving P_TRIM, zero-out partial unaligned chunks Philipp Reisner
2016-06-13 14:09 ` [PATCH 12/30] drbd: possibly disable discard support, if backend has discard_zeroes_data=0 Philipp Reisner
2016-06-13 14:09 ` [PATCH 13/30] drbd: zero-out partial unaligned discards on local backend Philipp Reisner
2016-06-13 14:09 ` [PATCH 14/30] drbd: allow larger max_discard_sectors Philipp Reisner
2016-06-13 14:09 ` [PATCH 15/30] drbd: finish resync on sync source only by notification from sync target Philipp Reisner
2016-06-13 14:09 ` [PATCH 16/30] drbd: introduce unfence-peer handler Philipp Reisner
2016-06-13 14:09 ` [PATCH 17/30] drbd: don't forget error completion when "unsuspending" IO Philipp Reisner
2016-06-13 14:09 ` [PATCH 18/30] drbd: if there is no good data accessible, writes should be IO errors Philipp Reisner
2016-06-13 14:09 ` [PATCH 19/30] drbd: only restart frozen disk io when D_UP_TO_DATE Philipp Reisner
2016-06-13 14:09 ` [PATCH 20/30] drbd: discard_zeroes_if_aligned allows "thin" resync for discard_zeroes_data=0 Philipp Reisner
2016-06-13 14:09 ` [PATCH 21/30] drbd: report sizes if rejecting too small peer disk Philipp Reisner
2016-06-13 14:09 ` [PATCH 22/30] drbd: introduce WRITE_SAME support Philipp Reisner
2016-06-13 14:09 ` [PATCH 23/30] drbd: sync_handshake: handle identical uuids with current (frozen) Primary Philipp Reisner
2016-06-13 14:09 ` [PATCH 24/30] drbd: disallow promotion during resync handshake, avoid deadlock and hard reset Philipp Reisner
2016-06-13 14:09 ` [PATCH 25/30] drbd: bump current uuid when resuming IO with diskless peer Philipp Reisner
2016-06-13 14:09 ` [PATCH 26/30] drbd: code cleanups without semantic changes Philipp Reisner
2016-06-13 14:09 ` [PATCH 27/30] drbd: get rid of empty statement in is_valid_state Philipp Reisner
2016-06-13 14:09 ` [PATCH 28/30] drbd: finally report ms, not jiffies, in log message Philipp Reisner
2016-06-13 14:09 ` [PATCH 29/30] drbd: al_write_transaction: skip re-scanning of bitmap page pointer array Philipp Reisner
2016-06-13 14:09 ` [PATCH 30/30] drbd: correctly handle failed crypto_alloc_hash Philipp Reisner
2016-06-13 15:11 ` [PATCH 00/30] DRBD updates Jens Axboe
2016-06-13 22:26   ` Philipp Reisner
2016-06-13 22:26   ` [PATCH 01/30] drbd: bitmap bulk IO: do not always suspend IO Philipp Reisner
2016-06-13 22:26   ` [PATCH 02/30] drbd: change bitmap write-out when leaving resync states Philipp Reisner
2016-06-13 22:26   ` [PATCH 03/30] drbd: Kill code duplication Philipp Reisner
2016-06-13 22:26   ` [PATCH 04/30] drbd: Implement handling of thinly provisioned storage on resync target nodes Philipp Reisner
2016-06-13 22:26   ` [PATCH 05/30] drbd: Introduce new disk config option rs-discard-granularity Philipp Reisner
2016-06-13 22:26   ` [PATCH 06/30] drbd: Create the protocol feature THIN_RESYNC Philipp Reisner
2016-06-13 22:26   ` [PATCH 07/30] drbd: adjust assert in w_bitmap_io to account for BM_LOCKED_CHANGE_ALLOWED Philipp Reisner
2016-06-13 22:26   ` [PATCH 08/30] drbd: fix regression: protocol A sometimes synchronous, C sometimes double-latency Philipp Reisner
2016-06-13 22:26   ` [PATCH 09/30] drbd: fix for truncated minor number in callback command line Philipp Reisner
2016-06-13 22:26   ` [PATCH 10/30] drbd: allow parallel flushes for multi-volume resources Philipp Reisner
2016-06-13 22:26   ` [PATCH 11/30] drbd: when receiving P_TRIM, zero-out partial unaligned chunks Philipp Reisner
2016-06-13 22:26   ` [PATCH 12/30] drbd: possibly disable discard support, if backend has discard_zeroes_data=0 Philipp Reisner
2016-06-13 22:26   ` [PATCH 13/30] drbd: zero-out partial unaligned discards on local backend Philipp Reisner
2016-06-13 22:26   ` [PATCH 14/30] drbd: allow larger max_discard_sectors Philipp Reisner
2016-06-13 22:26   ` [PATCH 15/30] drbd: finish resync on sync source only by notification from sync target Philipp Reisner
2016-06-13 22:26   ` [PATCH 16/30] drbd: introduce unfence-peer handler Philipp Reisner
2016-06-13 22:26   ` [PATCH 17/30] drbd: don't forget error completion when "unsuspending" IO Philipp Reisner
2016-06-13 22:26   ` [PATCH 18/30] drbd: if there is no good data accessible, writes should be IO errors Philipp Reisner
2016-06-13 22:26   ` [PATCH 19/30] drbd: only restart frozen disk io when D_UP_TO_DATE Philipp Reisner
2016-06-13 22:26   ` [PATCH 20/30] drbd: discard_zeroes_if_aligned allows "thin" resync for discard_zeroes_data=0 Philipp Reisner
2016-06-13 22:26   ` [PATCH 21/30] drbd: report sizes if rejecting too small peer disk Philipp Reisner
2016-06-13 22:26   ` [PATCH 22/30] drbd: introduce WRITE_SAME support Philipp Reisner
2016-06-13 22:26   ` [PATCH 23/30] drbd: sync_handshake: handle identical uuids with current (frozen) Primary Philipp Reisner
2016-06-13 22:26   ` [PATCH 24/30] drbd: disallow promotion during resync handshake, avoid deadlock and hard reset Philipp Reisner
2016-06-13 22:26   ` [PATCH 25/30] drbd: bump current uuid when resuming IO with diskless peer Philipp Reisner
2016-06-13 22:26   ` [PATCH 26/30] drbd: code cleanups without semantic changes Philipp Reisner
2016-06-13 22:26   ` [PATCH 27/30] drbd: get rid of empty statement in is_valid_state Philipp Reisner
2016-06-13 22:26   ` [PATCH 28/30] drbd: finally report ms, not jiffies, in log message Philipp Reisner
2016-06-13 22:26   ` [PATCH 29/30] drbd: al_write_transaction: skip re-scanning of bitmap page pointer array Philipp Reisner
2016-06-13 22:26   ` [PATCH 30/30] drbd: correctly handle failed crypto_alloc_hash Philipp Reisner
  -- strict thread matches above, loose matches on Subject: below --
2016-04-25 12:07 [PATCH 00/30] DBRD updates Philipp Reisner
2016-04-25 12:07 ` [PATCH 04/30] drbd: Implement handling of thinly provisioned storage on resync target nodes Philipp Reisner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1465826958-19398-5-git-send-email-philipp.reisner@linbit.com \
    --to=philipp.reisner@linbit.com \
    --cc=axboe@fb.com \
    --cc=drbd-dev@lists.linbit.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.