* [PATCH 0/4] nbd fixes for this cycle
From: Josef Bacik @ 2017-03-24 18:08 UTC (permalink / raw)
  To: axboe, nbd-general, linux-block, kernel-team

These 4 patches fix up various regressions and problems in NBD.  The
ERESTARTSYS patch is the biggest, but it has been pretty well tested with a
debug patch that forced the behavior to happen.  Everything else is relatively
small, and the queue timeout patch fixes a regression from last cycle.  Thanks,

Josef

* [PATCH 1/4] nbd: handle ERESTARTSYS properly
From: Josef Bacik @ 2017-03-24 18:08 UTC (permalink / raw)
  To: axboe, nbd-general, linux-block, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We can submit IO in a process's context, which means there can be
pending signals.  This isn't a fatal error for NBD, but it does require
some finesse.  If the signal happens before we transmit anything then we
are ok: just requeue the request and carry on.  However, if we've done a
partial transmit we can't allow anything else to be transmitted on this
socket until we transmit the remaining part of the request.  Deal with
this by keeping track of how much we've sent for the current request,
and if we get an ERESTARTSYS during any part of our transmission, save
the state of that request and requeue the IO.  If anybody tries to
submit a request that isn't our pending request, requeue that request
until we are able to service the one that is pending.

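In condensed form, the control flow added here looks roughly like the sketch
below.  This is a simplification of the diff that follows (names match the
patch; locking and most error handling are trimmed):

	/* nbd_handle_cmd(), simplified: a partially-sent request owns
	 * the socket until it has been fully transmitted.
	 */
	if (nsock->pending && nsock->pending != req) {
		blk_mq_requeue_request(req, true);	/* wait our turn */
		return 0;
	}
	ret = nbd_send_cmd(nbd, cmd, index);

	/* nbd_send_cmd(), simplified: on a partial send, save progress */
	result = sock_xmit(nbd, index, 1, &from, flags, &sent);
	if (result == -ERESTARTSYS) {
		if (sent) {			/* partial send: remember it */
			nsock->pending = req;	/* resume this request later */
			nsock->sent = sent;	/* bytes already transmitted */
		}
		return BLK_MQ_RQ_QUEUE_BUSY;	/* blk-mq will retry */
	}
	...
	nsock->pending = NULL;			/* fully sent; release socket */
	nsock->sent = 0;
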
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 115 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 89 insertions(+), 26 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7e4287b..3d1fc37a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -47,6 +47,8 @@ static DEFINE_MUTEX(nbd_index_mutex);
 struct nbd_sock {
 	struct socket *sock;
 	struct mutex tx_lock;
+	struct request *pending;
+	int sent;
 };
 
 #define NBD_TIMEDOUT			0
@@ -202,7 +204,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
  *  Send or receive packet.
  */
 static int sock_xmit(struct nbd_device *nbd, int index, int send,
-		     struct iov_iter *iter, int msg_flags)
+		     struct iov_iter *iter, int msg_flags, int *sent)
 {
 	struct socket *sock = nbd->socks[index]->sock;
 	int result;
@@ -237,6 +239,8 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 				result = -EPIPE; /* short read */
 			break;
 		}
+		if (sent)
+			*sent += result;
 	} while (msg_data_left(&msg));
 
 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
@@ -248,6 +252,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
+	struct nbd_sock *nsock = nbd->socks[index];
 	int result;
 	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
@@ -256,6 +261,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 	struct bio *bio;
 	u32 type;
 	u32 tag = blk_mq_unique_tag(req);
+	int sent = nsock->sent, skip = 0;
 
 	iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
 
@@ -283,6 +289,17 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 		return -EIO;
 	}
 
+	/* We did a partial send previously, and we at least sent the whole
+	 * request struct, so just go and send the rest of the pages in the
+	 * request.
+	 */
+	if (sent) {
+		if (sent >= sizeof(request)) {
+			skip = sent - sizeof(request);
+			goto send_pages;
+		}
+		iov_iter_advance(&from, sent);
+	}
 	request.type = htonl(type);
 	if (type != NBD_CMD_FLUSH) {
 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
@@ -294,15 +311,27 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 		cmd, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 	result = sock_xmit(nbd, index, 1, &from,
-			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
+			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
 	if (result <= 0) {
+		if (result == -ERESTARTSYS) {
+			/* If we haven't sent anything we can just return BUSY,
+			 * however if we have sent something we need to make
+			 * sure we only allow this req to be sent until we are
+			 * completely done.
+			 */
+			if (sent) {
+				nsock->pending = req;
+				nsock->sent = sent;
+			}
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		}
 		dev_err_ratelimited(disk_to_dev(nbd->disk),
 			"Send control failed (result %d)\n", result);
 		return -EIO;
 	}
-
+send_pages:
 	if (type != NBD_CMD_WRITE)
-		return 0;
+		goto out;
 
 	bio = req->bio;
 	while (bio) {
@@ -318,8 +347,25 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 				cmd, bvec.bv_len);
 			iov_iter_bvec(&from, ITER_BVEC | WRITE,
 				      &bvec, 1, bvec.bv_len);
-			result = sock_xmit(nbd, index, 1, &from, flags);
+			if (skip) {
+				if (skip >= iov_iter_count(&from)) {
+					skip -= iov_iter_count(&from);
+					continue;
+				}
+				iov_iter_advance(&from, skip);
+				skip = 0;
+			}
+			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 			if (result <= 0) {
+				if (result == -ERESTARTSYS) {
+					/* We've already sent the header, we
+					 * have no choice but to set pending and
+					 * return BUSY.
+					 */
+					nsock->pending = req;
+					nsock->sent = sent;
+					return BLK_MQ_RQ_QUEUE_BUSY;
+				}
 				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
 					result);
@@ -336,6 +382,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 		}
 		bio = next;
 	}
+out:
+	nsock->pending = NULL;
+	nsock->sent = 0;
 	return 0;
 }
 
@@ -353,7 +402,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 
 	reply.magic = 0;
 	iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
-	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 	if (result <= 0) {
 		if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
 		    !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
@@ -395,7 +444,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 		rq_for_each_segment(bvec, req, iter) {
 			iov_iter_bvec(&to, ITER_BVEC | READ,
 				      &bvec, 1, bvec.bv_len);
-			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL);
+			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
@@ -482,22 +531,23 @@ static void nbd_clear_que(struct nbd_device *nbd)
 }
 
 
-static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
+static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	struct nbd_device *nbd = cmd->nbd;
 	struct nbd_sock *nsock;
+	int ret;
 
 	if (index >= nbd->num_connections) {
 		dev_err_ratelimited(disk_to_dev(nbd->disk),
 				    "Attempted send on invalid socket\n");
-		goto error_out;
+		return -EINVAL;
 	}
 
 	if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
 		dev_err_ratelimited(disk_to_dev(nbd->disk),
 				    "Attempted send on closed socket\n");
-		goto error_out;
+		return -EINVAL;
 	}
 
 	req->errors = 0;
@@ -508,29 +558,30 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 		mutex_unlock(&nsock->tx_lock);
 		dev_err_ratelimited(disk_to_dev(nbd->disk),
 				    "Attempted send on closed socket\n");
-		goto error_out;
+		return -EINVAL;
 	}
 
-	if (nbd_send_cmd(nbd, cmd, index) != 0) {
-		dev_err_ratelimited(disk_to_dev(nbd->disk),
-				    "Request send failed\n");
-		req->errors++;
-		nbd_end_request(cmd);
+	/* Handle the case that we have a pending request that was partially
+	 * transmitted that _has_ to be serviced first.  We need to call requeue
+	 * here so that it gets put _after_ the request that is already on the
+	 * dispatch list.
+	 */
+	if (unlikely(nsock->pending && nsock->pending != req)) {
+		blk_mq_requeue_request(req, true);
+		ret = 0;
+		goto out;
 	}
-
+	ret = nbd_send_cmd(nbd, cmd, index);
+out:
 	mutex_unlock(&nsock->tx_lock);
-
-	return;
-
-error_out:
-	req->errors++;
-	nbd_end_request(cmd);
+	return ret;
 }
 
 static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 			const struct blk_mq_queue_data *bd)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	int ret;
 
 	/*
 	 * Since we look at the bio's to send the request over the network we
@@ -543,10 +594,20 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 */
 	init_completion(&cmd->send_complete);
 	blk_mq_start_request(bd->rq);
-	nbd_handle_cmd(cmd, hctx->queue_num);
+
+	/* We can be called directly from the user space process, which means we
+	 * could possibly have signals pending so our sendmsg will fail.  In
+	 * this case we need to return that we are busy, otherwise error out as
+	 * appropriate.
+	 */
+	ret = nbd_handle_cmd(cmd, hctx->queue_num);
+	if (ret < 0)
+		ret = BLK_MQ_RQ_QUEUE_ERROR;
+	if (!ret)
+		ret = BLK_MQ_RQ_QUEUE_OK;
 	complete(&cmd->send_complete);
 
-	return BLK_MQ_RQ_QUEUE_OK;
+	return ret;
 }
 
 static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
@@ -581,6 +642,8 @@ static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
 
 	mutex_init(&nsock->tx_lock);
 	nsock->sock = sock;
+	nsock->pending = NULL;
+	nsock->sent = 0;
 	socks[nbd->num_connections++] = nsock;
 
 	if (max_part)
@@ -634,7 +697,7 @@ static void send_disconnects(struct nbd_device *nbd)
 
 	for (i = 0; i < nbd->num_connections; i++) {
 		iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
-		ret = sock_xmit(nbd, i, 1, &from, 0);
+		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
 		if (ret <= 0)
 			dev_err(disk_to_dev(nbd->disk),
 				"Send disconnect failed %d\n", ret);
-- 
2.7.4

* [PATCH 2/4] nbd: set rq->errors to actual error code
From: Josef Bacik @ 2017-03-24 18:08 UTC (permalink / raw)
  To: axboe, nbd-general, linux-block, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We've been relying on the block layer to assume that a nonzero rq->errors
translates into -EIO.  I noticed in testing that this isn't always true, and
really there's no reason to have a counter instead of just using -EIO.  So
set it properly so we don't leak random numbers to unsuspecting victims.

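The problem in miniature, as a hedged sketch (the pass-through of rq->errors
as an errno is the assumption the old code was relying on; this is not code
from the patch):

	req->errors++;		/* old: an opaque counter (1, 2, ...) */
	req->errors = -EIO;	/* new: an actual negative errno */

	/* Anything downstream that treats req->errors as an errno (or
	 * compares it against real error codes) now sees -EIO instead
	 * of a small positive count that only sometimes got mapped to
	 * -EIO.
	 */
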
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3d1fc37a..dbc22f4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -192,7 +192,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 
 	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
 	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
-	req->errors++;
+	req->errors = -EIO;
 
 	mutex_lock(&nbd->config_lock);
 	sock_shutdown(nbd);
@@ -432,7 +432,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 	if (ntohl(reply.error)) {
 		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 			ntohl(reply.error));
-		req->errors++;
+		req->errors = -EIO;
 		return cmd;
 	}
 
@@ -448,7 +448,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
-				req->errors++;
+				req->errors = -EIO;
 				return cmd;
 			}
 			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
@@ -518,7 +518,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
 	if (!blk_mq_request_started(req))
 		return;
 	cmd = blk_mq_rq_to_pdu(req);
-	req->errors++;
+	req->errors = -EIO;
 	nbd_end_request(cmd);
 }
 
-- 
2.7.4

* [PATCH 3/4] nbd: set queue timeout properly
From: Josef Bacik @ 2017-03-24 18:08 UTC (permalink / raw)
  To: axboe, nbd-general, linux-block, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We can't just set the timeout on the tag set; we also have to set it on the
queue, which has already been set up at this point.

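Sketched, under the assumption that the queue copies the tag set's timeout
only once, when it is created:

	/* At device setup, the queue inherits tag_set.timeout: */
	disk->queue = blk_mq_init_queue(&nbd->tag_set);

	/* Later, from NBD_SET_TIMEOUT, updating only the tag set never
	 * reaches the already-initialized queue, so both must be set:
	 */
	nbd->tag_set.timeout = arg * HZ;		  /* future queues */
	blk_queue_rq_timeout(nbd->disk->queue, arg * HZ); /* the live queue */
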
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index dbc22f4..b0003da 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -844,7 +844,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		nbd_size_set(nbd, bdev, nbd->blksize, arg);
 		return 0;
 	case NBD_SET_TIMEOUT:
-		nbd->tag_set.timeout = arg * HZ;
+		if (arg) {
+			nbd->tag_set.timeout = arg * HZ;
+			blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
+		}
 		return 0;
 
 	case NBD_SET_FLAGS:
-- 
2.7.4

* [PATCH 4/4] nbd: replace kill_bdev() with __invalidate_device()
From: Josef Bacik @ 2017-03-24 18:08 UTC (permalink / raw)
  To: axboe, nbd-general, linux-block, kernel-team; +Cc: Ratna Manoj Bolla

From: Ratna Manoj Bolla <manoj.br@gmail.com>

When a filesystem is mounted on an nbd device, a disconnect destroys the
buffer_head mappings underneath the mounted filesystem, because kill_bdev()
is called and the bdev size is reset to zero.

After a bdev size reset (i.e. bdev->bd_inode->i_size = 0) on a disconnect,
followed by a sys_umount(),
        generic_shutdown_super()->...
        ->__sync_blockdev()->...
        ->blkdev_writepages()->...
        ->do_invalidatepage()->...
        ->discard_buffer()   discards a superblock buffer_head that
ext4_commit_super() assumes is in the mapped state.

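The difference between the two helpers, as a hedged gloss (summary comments,
not code from the patch):

	/* kill_bdev() truncates *all* page-cache pages for the bdev,
	 * destroying even mapped buffer_heads that a still-mounted
	 * filesystem holds references to:
	 */
	kill_bdev(bdev);		/* old: clobbers busy mappings */

	/* __invalidate_device() only invalidates what it safely can,
	 * skipping busy buffer_heads of a mounted filesystem:
	 */
	__invalidate_device(bdev, true); /* new: leaves busy buffers intact */
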
[mlin: ported to 4.11-rc2]
Signed-off-by: Ratna Manoj Bolla <manoj.br@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index b0003da..d8a2356 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -126,7 +126,8 @@ static const char *nbdcmd_to_ascii(int cmd)
 
 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 {
-	bd_set_size(bdev, 0);
+	if (bdev->bd_openers <= 1)
+		bd_set_size(bdev, 0);
 	set_capacity(nbd->disk, 0);
 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 
@@ -665,6 +666,8 @@ static void nbd_reset(struct nbd_device *nbd)
 
 static void nbd_bdev_reset(struct block_device *bdev)
 {
+	if (bdev->bd_openers > 1)
+		return;
 	set_device_ro(bdev, false);
 	bdev->bd_inode->i_size = 0;
 	if (max_part > 0) {
@@ -728,7 +731,8 @@ static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
 {
 	sock_shutdown(nbd);
 	nbd_clear_que(nbd);
-	kill_bdev(bdev);
+
+	__invalidate_device(bdev, true);
 	nbd_bdev_reset(bdev);
 	/*
 	 * We want to give the run thread a chance to wait for everybody
-- 
2.7.4

* Re: [PATCH 0/4] nbd fixes for this cycle
From: Jens Axboe @ 2017-03-24 18:57 UTC (permalink / raw)
  To: Josef Bacik, nbd-general, linux-block, kernel-team

On 03/24/2017 12:08 PM, Josef Bacik wrote:
> These 4 patches fix up various regressions and problems in NBD.  The
> ERESTARTSYS patch is the biggest, but it has been pretty well tested with a
> debug patch that forced the behavior to happen.  Everything else is relatively
> small, and the queue timeout patch fixes a regression from last cycle.  Thanks,

Added for 4.11.

-- 
Jens Axboe
