* [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-08 21:12 Josef Bacik
  2016-09-08 21:12 ` [PATCH 1/5] nbd: convert to blkmq Josef Bacik
                   ` (5 more replies)
  0 siblings, 6 replies; 95+ messages in thread
From: Josef Bacik @ 2016-09-08 21:12 UTC (permalink / raw)
  To: linux-block, linux-kernel, kernel-team, mpa, nbd-general

Apologies if you are getting this a second time, it appears vger ate my last
submission.

----------------------------------------------------------------------

This is a patch series aimed at bringing NBD into 2016.  The two big components
of this series are converting nbd over to using blkmq and then allowing us to
provide more than one connection for an nbd device.  The NBD user space server
doesn't care how many connections it has to a particular device, so we can
easily open multiple connections to the server and allow blkmq to handle
multiplexing over the different connections.  This drastically improves
performance with multi-threaded workloads over NBD.  The multi-connection
support requires some changes to the NBD userspace client code, and you can
find that code here

https://github.com/josefbacik/nbd
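
As far as the kernel interface goes, the client just hands the device several
connected sockets (one NBD_SET_SOCK per connection) before NBD_DO_IT.  Roughly
like this (illustrative sketch only, error handling omitted; num_connections
and connect_to_server() stand in for the real client's option parsing and
connect logic):

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <linux/nbd.h>

  int nbd = open("/dev/nbd0", O_RDWR);

  for (int i = 0; i < num_connections; i++) {
          int sock = connect_to_server();  /* ordinary connected TCP socket */
          ioctl(nbd, NBD_SET_SOCK, sock);  /* each call adds one connection */
  }
  ioctl(nbd, NBD_DO_IT);                   /* blocks until disconnect */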

I have been testing this for a few months and it is pretty solid and gives
excellent performance improvements.  Thanks,

Josef


^ permalink raw reply	[flat|nested] 95+ messages in thread

* [PATCH 1/5] nbd: convert to blkmq
  2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
@ 2016-09-08 21:12 ` Josef Bacik
  2016-09-08 21:12 ` [PATCH 2/5] nbd: don't shutdown sock with irq's disabled Josef Bacik
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 95+ messages in thread
From: Josef Bacik @ 2016-09-08 21:12 UTC (permalink / raw)
  To: linux-block, linux-kernel, kernel-team, mpa, nbd-general

This moves NBD over to using blkmq, which allows us to get rid of the
NBD-wide queue lock and the async submit kthread.  We will start with 1 hw
queue for now, but I plan to add multiple tcp connection support in the
future, at which point we'll fix how we set up the hw queues.
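
The interesting part of the conversion boils down to the queue_rq hook in the
hunks below: per-request state (struct nbd_cmd) is carved out of the request
pdu via the tag set's cmd_size, so there is no separate tracking list and no
hand-off to a submit thread:

  static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
                          const struct blk_mq_queue_data *bd)
  {
          struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);

          blk_mq_start_request(bd->rq);
          nbd_handle_cmd(cmd);    /* same send path as before, minus the kthread */
          return BLK_MQ_RQ_QUEUE_OK;
  }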

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 337 ++++++++++++++++++++--------------------------------
 1 file changed, 129 insertions(+), 208 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a9e3980..15e7c67 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -34,6 +34,7 @@
 #include <linux/kthread.h>
 #include <linux/types.h>
 #include <linux/debugfs.h>
+#include <linux/blk-mq.h>
 
 #include <asm/uaccess.h>
 #include <asm/types.h>
@@ -45,12 +46,8 @@ struct nbd_device {
 	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
 	int magic;
 
-	spinlock_t queue_lock;
-	struct list_head queue_head;	/* Requests waiting result */
-	struct request *active_req;
-	wait_queue_head_t active_wq;
-	struct list_head waiting_queue;	/* Requests to be sent */
-	wait_queue_head_t waiting_wq;
+	atomic_t outstanding_cmds;
+	struct blk_mq_tag_set tag_set;
 
 	struct mutex tx_lock;
 	struct gendisk *disk;
@@ -71,6 +68,11 @@ struct nbd_device {
 #endif
 };
 
+struct nbd_cmd {
+	struct nbd_device *nbd;
+	struct list_head list;
+};
+
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 static struct dentry *nbd_dbg_dir;
 #endif
@@ -83,18 +85,6 @@ static unsigned int nbds_max = 16;
 static struct nbd_device *nbd_dev;
 static int max_part;
 
-/*
- * Use just one lock (or at most 1 per NIC). Two arguments for this:
- * 1. Each NIC is essentially a synchronization point for all servers
- *    accessed through that NIC so there's no need to have more locks
- *    than NICs anyway.
- * 2. More locks lead to more "Dirty cache line bouncing" which will slow
- *    down each lock to the point where they're actually slower than just
- *    a single lock.
- * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
- */
-static DEFINE_SPINLOCK(nbd_lock);
-
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
 	return disk_to_dev(nbd->disk);
@@ -153,18 +143,17 @@ static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
 	return 0;
 }
 
-static void nbd_end_request(struct nbd_device *nbd, struct request *req)
+static void nbd_end_request(struct nbd_cmd *cmd)
 {
+	struct nbd_device *nbd = cmd->nbd;
+	struct request *req = blk_mq_rq_from_pdu(cmd);
 	int error = req->errors ? -EIO : 0;
-	struct request_queue *q = req->q;
-	unsigned long flags;
 
-	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
+	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
 		error ? "failed" : "done");
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_end_request_all(req, error);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	atomic_dec(&nbd->outstanding_cmds);
+	blk_mq_complete_request(req, error);
 }
 
 /*
@@ -193,7 +182,7 @@ static void nbd_xmit_timeout(unsigned long arg)
 	struct nbd_device *nbd = (struct nbd_device *)arg;
 	unsigned long flags;
 
-	if (list_empty(&nbd->queue_head))
+	if (!atomic_read(&nbd->outstanding_cmds))
 		return;
 
 	spin_lock_irqsave(&nbd->sock_lock, flags);
@@ -273,8 +262,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *nbd, struct request *req)
+static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 {
+	struct request *req = blk_mq_rq_from_pdu(cmd);
 	int result, flags;
 	struct nbd_request request;
 	unsigned long size = blk_rq_bytes(req);
@@ -298,10 +288,10 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 		request.len = htonl(size);
 	}
-	memcpy(request.handle, &req, sizeof(req));
+	memcpy(request.handle, &req->tag, sizeof(req->tag));
 
 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
-		req, nbdcmd_to_ascii(type),
+		cmd, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 	result = sock_xmit(nbd, 1, &request, sizeof(request),
 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
@@ -323,7 +313,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 			if (!rq_iter_last(bvec, iter))
 				flags = MSG_MORE;
 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
-				req, bvec.bv_len);
+				cmd, bvec.bv_len);
 			result = sock_send_bvec(nbd, &bvec, flags);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk),
@@ -336,29 +326,6 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 	return 0;
 }
 
-static struct request *nbd_find_request(struct nbd_device *nbd,
-					struct request *xreq)
-{
-	struct request *req, *tmp;
-	int err;
-
-	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
-	if (unlikely(err))
-		return ERR_PTR(err);
-
-	spin_lock(&nbd->queue_lock);
-	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
-		if (req != xreq)
-			continue;
-		list_del_init(&req->queuelist);
-		spin_unlock(&nbd->queue_lock);
-		return req;
-	}
-	spin_unlock(&nbd->queue_lock);
-
-	return ERR_PTR(-ENOENT);
-}
-
 static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
 {
 	int result;
@@ -370,11 +337,14 @@ static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *nbd)
+static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 {
 	int result;
 	struct nbd_reply reply;
-	struct request *req;
+	struct nbd_cmd *cmd;
+	struct request *req = NULL;
+	u16 hwq;
+	int tag;
 
 	reply.magic = 0;
 	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
@@ -390,25 +360,27 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
 		return ERR_PTR(-EPROTO);
 	}
 
-	req = nbd_find_request(nbd, *(struct request **)reply.handle);
-	if (IS_ERR(req)) {
-		result = PTR_ERR(req);
-		if (result != -ENOENT)
-			return ERR_PTR(result);
+	memcpy(&tag, reply.handle, sizeof(int));
 
-		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
-			reply.handle);
-		return ERR_PTR(-EBADR);
+	hwq = blk_mq_unique_tag_to_hwq(tag);
+	if (hwq < nbd->tag_set.nr_hw_queues)
+		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
+				       blk_mq_unique_tag_to_tag(tag));
+	if (!req || !blk_mq_request_started(req)) {
+		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
+			tag, req);
+		return ERR_PTR(-ENOENT);
 	}
+	cmd = blk_mq_rq_to_pdu(req);
 
 	if (ntohl(reply.error)) {
 		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 			ntohl(reply.error));
 		req->errors++;
-		return req;
+		return cmd;
 	}
 
-	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
+	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
 	if (rq_data_dir(req) != WRITE) {
 		struct req_iterator iter;
 		struct bio_vec bvec;
@@ -419,13 +391,13 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
 				req->errors++;
-				return req;
+				return cmd;
 			}
 			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
-				req, bvec.bv_len);
+				cmd, bvec.bv_len);
 		}
 	}
-	return req;
+	return cmd;
 }
 
 static ssize_t pid_show(struct device *dev,
@@ -444,7 +416,7 @@ static struct device_attribute pid_attr = {
 
 static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 {
-	struct request *req;
+	struct nbd_cmd *cmd;
 	int ret;
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
@@ -460,13 +432,13 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 	nbd_size_update(nbd, bdev);
 
 	while (1) {
-		req = nbd_read_stat(nbd);
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
+		cmd = nbd_read_stat(nbd);
+		if (IS_ERR(cmd)) {
+			ret = PTR_ERR(cmd);
 			break;
 		}
 
-		nbd_end_request(nbd, req);
+		nbd_end_request(cmd);
 	}
 
 	nbd_size_clear(nbd, bdev);
@@ -475,44 +447,37 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 	return ret;
 }
 
-static void nbd_clear_que(struct nbd_device *nbd)
+static void nbd_clear_req(struct request *req, void *data, bool reserved)
 {
-	struct request *req;
+	struct nbd_cmd *cmd;
 
+	if (!blk_mq_request_started(req))
+		return;
+	cmd = blk_mq_rq_to_pdu(req);
+	req->errors++;
+	nbd_end_request(cmd);
+}
+
+static void nbd_clear_que(struct nbd_device *nbd)
+{
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/*
 	 * Because we have set nbd->sock to NULL under the tx_lock, all
-	 * modifications to the list must have completed by now.  For
-	 * the same reason, the active_req must be NULL.
-	 *
-	 * As a consequence, we don't need to take the spin lock while
-	 * purging the list here.
+	 * modifications to the list must have completed by now.
 	 */
 	BUG_ON(nbd->sock);
-	BUG_ON(nbd->active_req);
 
-	while (!list_empty(&nbd->queue_head)) {
-		req = list_entry(nbd->queue_head.next, struct request,
-				 queuelist);
-		list_del_init(&req->queuelist);
-		req->errors++;
-		nbd_end_request(nbd, req);
-	}
-
-	while (!list_empty(&nbd->waiting_queue)) {
-		req = list_entry(nbd->waiting_queue.next, struct request,
-				 queuelist);
-		list_del_init(&req->queuelist);
-		req->errors++;
-		nbd_end_request(nbd, req);
-	}
+	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }
 
 
-static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
+static void nbd_handle_cmd(struct nbd_cmd *cmd)
 {
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+	struct nbd_device *nbd = cmd->nbd;
+
 	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;
 
@@ -526,6 +491,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 	req->errors = 0;
 
 	mutex_lock(&nbd->tx_lock);
+	nbd->task_send = current;
 	if (unlikely(!nbd->sock)) {
 		mutex_unlock(&nbd->tx_lock);
 		dev_err(disk_to_dev(nbd->disk),
@@ -533,106 +499,34 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 		goto error_out;
 	}
 
-	nbd->active_req = req;
-
-	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
+	if (nbd->xmit_timeout && !atomic_read(&nbd->outstanding_cmds))
 		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
 
-	if (nbd_send_req(nbd, req) != 0) {
+	atomic_inc(&nbd->outstanding_cmds);
+	if (nbd_send_cmd(nbd, cmd) != 0) {
 		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
-		nbd_end_request(nbd, req);
-	} else {
-		spin_lock(&nbd->queue_lock);
-		list_add_tail(&req->queuelist, &nbd->queue_head);
-		spin_unlock(&nbd->queue_lock);
+		nbd_end_request(cmd);
 	}
 
-	nbd->active_req = NULL;
+	nbd->task_send = NULL;
 	mutex_unlock(&nbd->tx_lock);
-	wake_up_all(&nbd->active_wq);
 
 	return;
 
 error_out:
 	req->errors++;
-	nbd_end_request(nbd, req);
-}
-
-static int nbd_thread_send(void *data)
-{
-	struct nbd_device *nbd = data;
-	struct request *req;
-
-	nbd->task_send = current;
-
-	set_user_nice(current, MIN_NICE);
-	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
-		/* wait for something to do */
-		wait_event_interruptible(nbd->waiting_wq,
-					 kthread_should_stop() ||
-					 !list_empty(&nbd->waiting_queue));
-
-		/* extract request */
-		if (list_empty(&nbd->waiting_queue))
-			continue;
-
-		spin_lock_irq(&nbd->queue_lock);
-		req = list_entry(nbd->waiting_queue.next, struct request,
-				 queuelist);
-		list_del_init(&req->queuelist);
-		spin_unlock_irq(&nbd->queue_lock);
-
-		/* handle request */
-		nbd_handle_req(nbd, req);
-	}
-
-	nbd->task_send = NULL;
-
-	return 0;
+	nbd_end_request(cmd);
 }
 
-/*
- * We always wait for result of write, for now. It would be nice to make it optional
- * in future
- * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
- *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
- */
-
-static void nbd_request_handler(struct request_queue *q)
-		__releases(q->queue_lock) __acquires(q->queue_lock)
+static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+			const struct blk_mq_queue_data *bd)
 {
-	struct request *req;
-	
-	while ((req = blk_fetch_request(q)) != NULL) {
-		struct nbd_device *nbd;
-
-		spin_unlock_irq(q->queue_lock);
-
-		nbd = req->rq_disk->private_data;
-
-		BUG_ON(nbd->magic != NBD_MAGIC);
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
-		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
-			req, req->cmd_type);
-
-		if (unlikely(!nbd->sock)) {
-			dev_err_ratelimited(disk_to_dev(nbd->disk),
-					    "Attempted send on closed socket\n");
-			req->errors++;
-			nbd_end_request(nbd, req);
-			spin_lock_irq(q->queue_lock);
-			continue;
-		}
-
-		spin_lock_irq(&nbd->queue_lock);
-		list_add_tail(&req->queuelist, &nbd->waiting_queue);
-		spin_unlock_irq(&nbd->queue_lock);
-
-		wake_up(&nbd->waiting_wq);
-
-		spin_lock_irq(q->queue_lock);
-	}
+	blk_mq_start_request(bd->rq);
+	nbd_handle_cmd(cmd);
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
@@ -700,33 +594,37 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 {
 	switch (cmd) {
 	case NBD_DISCONNECT: {
-		struct request sreq;
+		struct request *sreq;
 
 		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
 		if (!nbd->sock)
 			return -EINVAL;
 
+		sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
+		if (!sreq)
+			return -ENOMEM;
+
 		mutex_unlock(&nbd->tx_lock);
 		fsync_bdev(bdev);
 		mutex_lock(&nbd->tx_lock);
-		blk_rq_init(NULL, &sreq);
-		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
+		sreq->cmd_type = REQ_TYPE_DRV_PRIV;
 
 		/* Check again after getting mutex back.  */
-		if (!nbd->sock)
+		if (!nbd->sock) {
+			blk_mq_free_request(sreq);
 			return -EINVAL;
+		}
 
 		nbd->disconnect = true;
 
-		nbd_send_req(nbd, &sreq);
+		nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
+		blk_mq_free_request(sreq);
 		return 0;
 	}
  
 	case NBD_CLEAR_SOCK:
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
-		BUG_ON(!list_empty(&nbd->queue_head));
-		BUG_ON(!list_empty(&nbd->waiting_queue));
 		kill_bdev(bdev);
 		return 0;
 
@@ -772,7 +670,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		return 0;
 
 	case NBD_DO_IT: {
-		struct task_struct *thread;
 		int error;
 
 		if (nbd->task_recv)
@@ -786,18 +683,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 
 		nbd_parse_flags(nbd, bdev);
 
-		thread = kthread_run(nbd_thread_send, nbd, "%s",
-				     nbd_name(nbd));
-		if (IS_ERR(thread)) {
-			mutex_lock(&nbd->tx_lock);
-			nbd->task_recv = NULL;
-			return PTR_ERR(thread);
-		}
-
 		nbd_dev_dbg_init(nbd);
 		error = nbd_thread_recv(nbd, bdev);
 		nbd_dev_dbg_close(nbd);
-		kthread_stop(thread);
 
 		mutex_lock(&nbd->tx_lock);
 		nbd->task_recv = NULL;
@@ -825,10 +713,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		return 0;
 
 	case NBD_PRINT_DEBUG:
-		dev_info(disk_to_dev(nbd->disk),
-			"next = %p, prev = %p, head = %p\n",
-			nbd->queue_head.next, nbd->queue_head.prev,
-			&nbd->queue_head);
+		/*
+		 * For compatibility only, we no longer keep a list of
+		 * outstanding requests.
+		 */
 		return 0;
 	}
 	return -ENOTTY;
@@ -987,6 +875,23 @@ static void nbd_dbg_close(void)
 
 #endif
 
+static int nbd_init_request(void *data, struct request *rq,
+			    unsigned int hctx_idx, unsigned int request_idx,
+			    unsigned int numa_node)
+{
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	cmd->nbd = data;
+	INIT_LIST_HEAD(&cmd->list);
+	return 0;
+}
+
+static struct blk_mq_ops nbd_mq_ops = {
+	.queue_rq	= nbd_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_request	= nbd_init_request,
+};
+
 /*
  * And here should be modules and kernel interface 
  *  (Just smiley confuses emacs :-)
@@ -1035,16 +940,34 @@ static int __init nbd_init(void)
 		if (!disk)
 			goto out;
 		nbd_dev[i].disk = disk;
+
+		nbd_dev[i].tag_set.ops = &nbd_mq_ops;
+		nbd_dev[i].tag_set.nr_hw_queues = 1;
+		nbd_dev[i].tag_set.queue_depth = 128;
+		nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE;
+		nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd);
+		nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
+			BLK_MQ_F_SG_MERGE;
+		nbd_dev[i].tag_set.driver_data = &nbd_dev[i];
+
+		err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set);
+		if (err) {
+			put_disk(disk);
+			goto out;
+		}
+
 		/*
 		 * The new linux 2.5 block layer implementation requires
 		 * every gendisk to have its very own request_queue struct.
 		 * These structs are big so we dynamically allocate them.
 		 */
-		disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
+		disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set);
 		if (!disk->queue) {
+			blk_mq_free_tag_set(&nbd_dev[i].tag_set);
 			put_disk(disk);
 			goto out;
 		}
+
 		/*
 		 * Tell the block layer that we are not a rotational device
 		 */
@@ -1069,16 +992,12 @@ static int __init nbd_init(void)
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].magic = NBD_MAGIC;
-		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
-		spin_lock_init(&nbd_dev[i].queue_lock);
 		spin_lock_init(&nbd_dev[i].sock_lock);
-		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
 		mutex_init(&nbd_dev[i].tx_lock);
 		init_timer(&nbd_dev[i].timeout_timer);
 		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
 		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
-		init_waitqueue_head(&nbd_dev[i].active_wq);
-		init_waitqueue_head(&nbd_dev[i].waiting_wq);
+		atomic_set(&nbd_dev[i].outstanding_cmds, 0);
 		disk->major = NBD_MAJOR;
 		disk->first_minor = i << part_shift;
 		disk->fops = &nbd_fops;
@@ -1091,6 +1010,7 @@ static int __init nbd_init(void)
 	return 0;
 out:
 	while (i--) {
+		blk_mq_free_tag_set(&nbd_dev[i].tag_set);
 		blk_cleanup_queue(nbd_dev[i].disk->queue);
 		put_disk(nbd_dev[i].disk);
 	}
@@ -1110,6 +1030,7 @@ static void __exit nbd_cleanup(void)
 		if (disk) {
 			del_gendisk(disk);
 			blk_cleanup_queue(disk->queue);
+			blk_mq_free_tag_set(&nbd_dev[i].tag_set);
 			put_disk(disk);
 		}
 	}
-- 
2.8.0.rc2


^ permalink raw reply related	[flat|nested] 95+ messages in thread

* [PATCH 2/5] nbd: don't shutdown sock with irq's disabled
  2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
  2016-09-08 21:12 ` [PATCH 1/5] nbd: convert to blkmq Josef Bacik
@ 2016-09-08 21:12 ` Josef Bacik
  2016-09-08 21:12 ` [PATCH 3/5] nbd: use flags instead of bool Josef Bacik
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 95+ messages in thread
From: Josef Bacik @ 2016-09-08 21:12 UTC (permalink / raw)
  To: linux-block, linux-kernel, kernel-team, mpa, nbd-general

We hit a warning when shutting down the nbd connection because we have IRQs
disabled.  We don't really need to do the shutdown under the lock, only clear
nbd->sock there, so do the actual shutdown outside of the IRQ-disabled
section.  This gets rid of the warning.
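
The pattern, simplified from the hunks below, is to take a reference to the
socket while holding the spinlock and only call kernel_sock_shutdown() after
dropping it:

  spin_lock_irqsave(&nbd->sock_lock, flags);
  if (nbd->sock) {
          sock = nbd->sock;
          get_file(sock->file);   /* keep the socket alive past the unlock */
  }
  spin_unlock_irqrestore(&nbd->sock_lock, flags);

  if (sock) {
          kernel_sock_shutdown(sock, SHUT_RDWR);
          sockfd_put(sock);
  }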

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 15e7c67..4b7d0f3 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -161,6 +161,8 @@ static void nbd_end_request(struct nbd_cmd *cmd)
  */
 static void sock_shutdown(struct nbd_device *nbd)
 {
+	struct socket *sock;
+
 	spin_lock_irq(&nbd->sock_lock);
 
 	if (!nbd->sock) {
@@ -168,18 +170,21 @@ static void sock_shutdown(struct nbd_device *nbd)
 		return;
 	}
 
+	sock = nbd->sock;
 	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
-	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
-	sockfd_put(nbd->sock);
 	nbd->sock = NULL;
 	spin_unlock_irq(&nbd->sock_lock);
 
+	kernel_sock_shutdown(sock, SHUT_RDWR);
+	sockfd_put(sock);
+
 	del_timer(&nbd->timeout_timer);
 }
 
 static void nbd_xmit_timeout(unsigned long arg)
 {
 	struct nbd_device *nbd = (struct nbd_device *)arg;
+	struct socket *sock = NULL;
 	unsigned long flags;
 
 	if (!atomic_read(&nbd->outstanding_cmds))
@@ -189,10 +194,16 @@ static void nbd_xmit_timeout(unsigned long arg)
 
 	nbd->timedout = true;
 
-	if (nbd->sock)
-		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+	if (nbd->sock) {
+		sock = nbd->sock;
+		get_file(sock->file);
+	}
 
 	spin_unlock_irqrestore(&nbd->sock_lock, flags);
+	if (sock) {
+		kernel_sock_shutdown(sock, SHUT_RDWR);
+		sockfd_put(sock);
+	}
 
 	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
 }
-- 
2.8.0.rc2


^ permalink raw reply related	[flat|nested] 95+ messages in thread

* [PATCH 3/5] nbd: use flags instead of bool
  2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
  2016-09-08 21:12 ` [PATCH 1/5] nbd: convert to blkmq Josef Bacik
  2016-09-08 21:12 ` [PATCH 2/5] nbd: don't shutdown sock with irq's disabled Josef Bacik
@ 2016-09-08 21:12 ` Josef Bacik
  2016-09-09  1:20   ` Joe Perches
  2016-09-08 21:12 ` [PATCH 4/5] nbd: allow block mq to deal with timeouts Josef Bacik
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 95+ messages in thread
From: Josef Bacik @ 2016-09-08 21:12 UTC (permalink / raw)
  To: linux-block, linux-kernel, kernel-team, mpa, nbd-general

In preparation for some future changes, change a few of the state bools over to
normal bits to set/clear properly.
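
Concretely, the bools become bit numbers in a single runtime_flags word,
manipulated with the usual atomic bitops (taken from the hunks below):

  #define NBD_TIMEDOUT                  0
  #define NBD_DISCONNECT_REQUESTED      1

  set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);

  if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
          error = -ETIMEDOUT;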

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4b7d0f3..cf855a1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -41,8 +41,12 @@
 
 #include <linux/nbd.h>
 
+#define NBD_TIMEDOUT			0
+#define NBD_DISCONNECT_REQUESTED	1
+
 struct nbd_device {
 	u32 flags;
+	unsigned long runtime_flags;
 	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
 	int magic;
 
@@ -54,8 +58,6 @@ struct nbd_device {
 	int blksize;
 	loff_t bytesize;
 	int xmit_timeout;
-	bool timedout;
-	bool disconnect; /* a disconnect has been requested by user */
 
 	struct timer_list timeout_timer;
 	/* protects initialization and shutdown of the socket */
@@ -192,7 +194,7 @@ static void nbd_xmit_timeout(unsigned long arg)
 
 	spin_lock_irqsave(&nbd->sock_lock, flags);
 
-	nbd->timedout = true;
+	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
 
 	if (nbd->sock) {
 		sock = nbd->sock;
@@ -562,8 +564,7 @@ out:
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
-	nbd->disconnect = false;
-	nbd->timedout = false;
+	nbd->runtime_flags = 0;
 	nbd->blksize = 1024;
 	nbd->bytesize = 0;
 	set_capacity(nbd->disk, 0);
@@ -626,7 +627,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 			return -EINVAL;
 		}
 
-		nbd->disconnect = true;
+		set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
 
 		nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
 		blk_mq_free_request(sreq);
@@ -706,9 +707,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		kill_bdev(bdev);
 		nbd_bdev_reset(bdev);
 
-		if (nbd->disconnect) /* user requested, ignore socket errors */
+		/* user requested, ignore socket errors */
+		if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
 			error = 0;
-		if (nbd->timedout)
+		if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
 			error = -ETIMEDOUT;
 
 		nbd_reset(nbd);
-- 
2.8.0.rc2


^ permalink raw reply related	[flat|nested] 95+ messages in thread

* [PATCH 4/5] nbd: allow block mq to deal with timeouts
  2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
                   ` (2 preceding siblings ...)
  2016-09-08 21:12 ` [PATCH 3/5] nbd: use flags instead of bool Josef Bacik
@ 2016-09-08 21:12 ` Josef Bacik
  2016-09-08 21:12 ` [PATCH 5/5] nbd: add multi-connection support Josef Bacik
  2016-09-09 20:02 ` [Nbd] [RESEND][PATCH 0/5] nbd improvements Wouter Verhelst
  5 siblings, 0 replies; 95+ messages in thread
From: Josef Bacik @ 2016-09-08 21:12 UTC (permalink / raw)
  To: linux-block, linux-kernel, kernel-team, mpa, nbd-general

Instead of rolling our own timer, just utilize the blk-mq request timeout and
do the disconnect if any of our commands time out.
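
That is, the tag set grows a ->timeout handler and blk-mq arms and expires the
timer per request for us; the handler just fails the request and tears the
connection down.  Simplified from the hunks below:

  static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
                                                   bool reserved)
  {
          /* shut the socket down (see the hunk below), then fail the request */
          req->errors++;
          return BLK_EH_HANDLED;
  }

  static struct blk_mq_ops nbd_mq_ops = {
          .queue_rq       = nbd_queue_rq,
          .map_queue      = blk_mq_map_queue,
          .init_request   = nbd_init_request,
          .timeout        = nbd_xmit_timeout,
  };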

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 51 ++++++++++++++-------------------------------------
 1 file changed, 14 insertions(+), 37 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cf855a1..4c6dd1a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -50,16 +50,13 @@ struct nbd_device {
 	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
 	int magic;
 
-	atomic_t outstanding_cmds;
 	struct blk_mq_tag_set tag_set;
 
 	struct mutex tx_lock;
 	struct gendisk *disk;
 	int blksize;
 	loff_t bytesize;
-	int xmit_timeout;
 
-	struct timer_list timeout_timer;
 	/* protects initialization and shutdown of the socket */
 	spinlock_t sock_lock;
 	struct task_struct *task_recv;
@@ -154,7 +151,6 @@ static void nbd_end_request(struct nbd_cmd *cmd)
 	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
 		error ? "failed" : "done");
 
-	atomic_dec(&nbd->outstanding_cmds);
 	blk_mq_complete_request(req, error);
 }
 
@@ -165,7 +161,7 @@ static void sock_shutdown(struct nbd_device *nbd)
 {
 	struct socket *sock;
 
-	spin_lock_irq(&nbd->sock_lock);
+	spin_lock(&nbd->sock_lock);
 
 	if (!nbd->sock) {
 		spin_unlock_irq(&nbd->sock_lock);
@@ -175,24 +171,20 @@ static void sock_shutdown(struct nbd_device *nbd)
 	sock = nbd->sock;
 	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
 	nbd->sock = NULL;
-	spin_unlock_irq(&nbd->sock_lock);
+	spin_unlock(&nbd->sock_lock);
 
 	kernel_sock_shutdown(sock, SHUT_RDWR);
 	sockfd_put(sock);
-
-	del_timer(&nbd->timeout_timer);
 }
 
-static void nbd_xmit_timeout(unsigned long arg)
+static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
+						 bool reserved)
 {
-	struct nbd_device *nbd = (struct nbd_device *)arg;
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+	struct nbd_device *nbd = cmd->nbd;
 	struct socket *sock = NULL;
-	unsigned long flags;
-
-	if (!atomic_read(&nbd->outstanding_cmds))
-		return;
 
-	spin_lock_irqsave(&nbd->sock_lock, flags);
+	spin_lock(&nbd->sock_lock);
 
 	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
 
@@ -201,13 +193,15 @@ static void nbd_xmit_timeout(unsigned long arg)
 		get_file(sock->file);
 	}
 
-	spin_unlock_irqrestore(&nbd->sock_lock, flags);
+	spin_unlock(&nbd->sock_lock);
 	if (sock) {
 		kernel_sock_shutdown(sock, SHUT_RDWR);
 		sockfd_put(sock);
 	}
 
+	req->errors++;
 	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
+	return BLK_EH_HANDLED;
 }
 
 /*
@@ -257,9 +251,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 
 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 
-	if (!send && nbd->xmit_timeout)
-		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
-
 	return result;
 }
 
@@ -512,10 +503,6 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd)
 		goto error_out;
 	}
 
-	if (nbd->xmit_timeout && !atomic_read(&nbd->outstanding_cmds))
-		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
-
-	atomic_inc(&nbd->outstanding_cmds);
 	if (nbd_send_cmd(nbd, cmd) != 0) {
 		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
@@ -569,9 +556,8 @@ static void nbd_reset(struct nbd_device *nbd)
 	nbd->bytesize = 0;
 	set_capacity(nbd->disk, 0);
 	nbd->flags = 0;
-	nbd->xmit_timeout = 0;
+	nbd->tag_set.timeout = 0;
 	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
-	del_timer_sync(&nbd->timeout_timer);
 }
 
 static void nbd_bdev_reset(struct block_device *bdev)
@@ -668,13 +654,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		return nbd_size_set(nbd, bdev, nbd->blksize, arg);
 
 	case NBD_SET_TIMEOUT:
-		nbd->xmit_timeout = arg * HZ;
-		if (arg)
-			mod_timer(&nbd->timeout_timer,
-				  jiffies + nbd->xmit_timeout);
-		else
-			del_timer_sync(&nbd->timeout_timer);
-
+		nbd->tag_set.timeout = arg * HZ;
 		return 0;
 
 	case NBD_SET_FLAGS:
@@ -836,7 +816,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
 
 	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
 	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
-	debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
+	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
 	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
 	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
 
@@ -903,6 +883,7 @@ static struct blk_mq_ops nbd_mq_ops = {
 	.queue_rq	= nbd_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_request	= nbd_init_request,
+	.timeout	= nbd_xmit_timeout,
 };
 
 /*
@@ -1007,10 +988,6 @@ static int __init nbd_init(void)
 		nbd_dev[i].magic = NBD_MAGIC;
 		spin_lock_init(&nbd_dev[i].sock_lock);
 		mutex_init(&nbd_dev[i].tx_lock);
-		init_timer(&nbd_dev[i].timeout_timer);
-		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
-		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
-		atomic_set(&nbd_dev[i].outstanding_cmds, 0);
 		disk->major = NBD_MAJOR;
 		disk->first_minor = i << part_shift;
 		disk->fops = &nbd_fops;
-- 
2.8.0.rc2


^ permalink raw reply related	[flat|nested] 95+ messages in thread

* [PATCH 5/5] nbd: add multi-connection support
  2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
                   ` (3 preceding siblings ...)
  2016-09-08 21:12 ` [PATCH 4/5] nbd: allow block mq to deal with timeouts Josef Bacik
@ 2016-09-08 21:12 ` Josef Bacik
  2016-09-10  7:43   ` Christoph Hellwig
  2016-09-09 20:02 ` [Nbd] [RESEND][PATCH 0/5] nbd improvements Wouter Verhelst
  5 siblings, 1 reply; 95+ messages in thread
From: Josef Bacik @ 2016-09-08 21:12 UTC (permalink / raw)
  To: linux-block, linux-kernel, kernel-team, mpa, nbd-general

NBD can become contended on its single connection.  We have to serialize all
writes, and we can only process one read response at a time.  Fix this by
allowing userspace to provide multiple connections to a single nbd device.
This, coupled with blk-mq, drastically increases performance in multi-process
cases.
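
The mapping is one blk-mq hardware queue per connection: each socket gets its
own tx_lock, a command remembers which hctx (and therefore which socket) it
belongs to, and submission only serializes against other requests on the same
connection.  Simplified from the hunks below:

  struct nbd_sock {
          struct socket *sock;
          struct mutex tx_lock;
  };

  /* NBD_DO_IT: one hw queue per configured connection */
  blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);

  /* .init_request: remember which connection this command belongs to */
  cmd->index = hctx_idx;

  /* submission path: only serialize against the same connection */
  struct nbd_sock *nsock = nbd->socks[cmd->index];

  mutex_lock(&nsock->tx_lock);
  nbd_send_cmd(nbd, cmd);         /* goes out over nsock->sock */
  mutex_unlock(&nsock->tx_lock);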
Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 drivers/block/nbd.c | 267 ++++++++++++++++++++++++++++------------------------
 1 file changed, 143 insertions(+), 124 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4c6dd1a..4aa45ed 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -41,26 +41,35 @@
 
 #include <linux/nbd.h>
 
+struct nbd_sock {
+	struct socket *sock;
+	struct mutex tx_lock;
+	unsigned int index;
+};
+
 #define NBD_TIMEDOUT			0
 #define NBD_DISCONNECT_REQUESTED	1
+#define NBD_DISCONNECTED		2
 
 struct nbd_device {
 	u32 flags;
 	unsigned long runtime_flags;
-	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
+	struct nbd_sock **socks;
 	int magic;
 
 	struct blk_mq_tag_set tag_set;
 
-	struct mutex tx_lock;
+	struct mutex config_lock;
 	struct gendisk *disk;
+	int num_connections;
+	atomic_t recv_threads;
+	wait_queue_head_t recv_wq;
 	int blksize;
 	loff_t bytesize;
 
 	/* protects initialization and shutdown of the socket */
 	spinlock_t sock_lock;
 	struct task_struct *task_recv;
-	struct task_struct *task_send;
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 	struct dentry *dbg_dir;
@@ -69,7 +78,7 @@ struct nbd_device {
 
 struct nbd_cmd {
 	struct nbd_device *nbd;
-	struct list_head list;
+	int index;
 };
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -159,22 +168,18 @@ static void nbd_end_request(struct nbd_cmd *cmd)
  */
 static void sock_shutdown(struct nbd_device *nbd)
 {
-	struct socket *sock;
-
-	spin_lock(&nbd->sock_lock);
+	int i;
 
-	if (!nbd->sock) {
-		spin_unlock_irq(&nbd->sock_lock);
+	if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
 		return;
-	}
-
-	sock = nbd->sock;
-	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
-	nbd->sock = NULL;
-	spin_unlock(&nbd->sock_lock);
 
-	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sockfd_put(sock);
+	for (i = 0; i < nbd->num_connections; i++) {
+		struct nbd_sock *nsock = nbd->socks[i];
+		mutex_lock(&nsock->tx_lock);
+		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+		mutex_unlock(&nsock->tx_lock);
+	}
+	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 }
 
 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
@@ -182,35 +187,31 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct nbd_device *nbd = cmd->nbd;
-	struct socket *sock = NULL;
-
-	spin_lock(&nbd->sock_lock);
 
+	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
 	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
-
-	if (nbd->sock) {
-		sock = nbd->sock;
-		get_file(sock->file);
-	}
-
-	spin_unlock(&nbd->sock_lock);
-	if (sock) {
-		kernel_sock_shutdown(sock, SHUT_RDWR);
-		sockfd_put(sock);
-	}
-
 	req->errors++;
-	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
+
+	/*
+	 * If our disconnect packet times out then we're already holding the
+	 * config_lock and could deadlock here, so just set an error and return,
+	 * we'll handle shutting everything down later.
+	 */
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		return BLK_EH_HANDLED;
+	mutex_lock(&nbd->config_lock);
+	sock_shutdown(nbd);
+	mutex_unlock(&nbd->config_lock);
 	return BLK_EH_HANDLED;
 }
 
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
-		int msg_flags)
+static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf,
+		     int size, int msg_flags)
 {
-	struct socket *sock = nbd->sock;
+	struct socket *sock = nbd->socks[index]->sock;
 	int result;
 	struct msghdr msg;
 	struct kvec iov;
@@ -254,12 +255,12 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 	return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
-		int flags)
+static inline int sock_send_bvec(struct nbd_device *nbd, int index,
+				 struct bio_vec *bvec, int flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+	result = sock_xmit(nbd, index, 1, kaddr + bvec->bv_offset,
 			   bvec->bv_len, flags);
 	kunmap(bvec->bv_page);
 	return result;
@@ -297,7 +298,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 		cmd, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
-	result = sock_xmit(nbd, 1, &request, sizeof(request),
+	result = sock_xmit(nbd, cmd->index, 1, &request, sizeof(request),
 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
 		dev_err(disk_to_dev(nbd->disk),
@@ -318,7 +319,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 				flags = MSG_MORE;
 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 				cmd, bvec.bv_len);
-			result = sock_send_bvec(nbd, &bvec, flags);
+			result = sock_send_bvec(nbd, cmd->index, &bvec, flags);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
@@ -330,18 +331,19 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	return 0;
 }
 
-static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, int index,
+				 struct bio_vec *bvec)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
-			MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, kaddr + bvec->bv_offset,
+			   bvec->bv_len, MSG_WAITALL);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
+static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 {
 	int result;
 	struct nbd_reply reply;
@@ -351,7 +353,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 	int tag;
 
 	reply.magic = 0;
-	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, &reply, sizeof(reply), MSG_WAITALL);
 	if (result <= 0) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Receive control failed (result %d)\n", result);
@@ -390,7 +392,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 		struct bio_vec bvec;
 
 		rq_for_each_segment(bvec, req, iter) {
-			result = sock_recv_bvec(nbd, &bvec);
+			result = sock_recv_bvec(nbd, index, &bvec);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
@@ -404,39 +406,24 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 	return cmd;
 }
 
-static ssize_t pid_show(struct device *dev,
-			struct device_attribute *attr, char *buf)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
-
-	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
-}
-
-static struct device_attribute pid_attr = {
-	.attr = { .name = "pid", .mode = S_IRUGO},
-	.show = pid_show,
+struct recv_thread_args {
+	struct work_struct work;
+	struct nbd_device *nbd;
+	int index;
 };
 
-static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
+static void recv_work(struct work_struct *work)
 {
+	struct recv_thread_args *args = container_of(work,
+						     struct recv_thread_args,
+						     work);
+	struct nbd_device *nbd = args->nbd;
 	struct nbd_cmd *cmd;
 	int ret;
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
-
-	sk_set_memalloc(nbd->sock->sk);
-
-	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
-	if (ret) {
-		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-		return ret;
-	}
-
-	nbd_size_update(nbd, bdev);
-
 	while (1) {
-		cmd = nbd_read_stat(nbd);
+		cmd = nbd_read_stat(nbd, args->index);
 		if (IS_ERR(cmd)) {
 			ret = PTR_ERR(cmd);
 			break;
@@ -445,10 +432,8 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 		nbd_end_request(cmd);
 	}
 
-	nbd_size_clear(nbd, bdev);
-
-	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-	return ret;
+	atomic_dec(&nbd->recv_threads);
+	wake_up(&nbd->recv_wq);
 }
 
 static void nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -466,12 +451,6 @@ static void nbd_clear_que(struct nbd_device *nbd)
 {
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	/*
-	 * Because we have set nbd->sock to NULL under the tx_lock, all
-	 * modifications to the list must have completed by now.
-	 */
-	BUG_ON(nbd->sock);
-
 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }
@@ -481,11 +460,20 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	struct nbd_device *nbd = cmd->nbd;
+	struct nbd_sock *nsock = nbd->socks[cmd->index];
 
-	if (req->cmd_type != REQ_TYPE_FS)
+	if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on closed socket\n");
 		goto error_out;
+	}
 
-	if (rq_data_dir(req) == WRITE &&
+	if (req->cmd_type != REQ_TYPE_FS &&
+	    req->cmd_type != REQ_TYPE_DRV_PRIV)
+		goto error_out;
+
+	if (req->cmd_type == REQ_TYPE_FS &&
+	    rq_data_dir(req) == WRITE &&
 	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Write on read-only\n");
@@ -494,10 +482,9 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd)
 
 	req->errors = 0;
 
-	mutex_lock(&nbd->tx_lock);
-	nbd->task_send = current;
-	if (unlikely(!nbd->sock)) {
-		mutex_unlock(&nbd->tx_lock);
+	mutex_lock(&nsock->tx_lock);
+	if (unlikely(!nsock->sock)) {
+		mutex_unlock(&nsock->tx_lock);
 		dev_err(disk_to_dev(nbd->disk),
 			"Attempted send on closed socket\n");
 		goto error_out;
@@ -509,8 +496,7 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd)
 		nbd_end_request(cmd);
 	}
 
-	nbd->task_send = NULL;
-	mutex_unlock(&nbd->tx_lock);
+	mutex_unlock(&nsock->tx_lock);
 
 	return;
 
@@ -529,34 +515,45 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
+static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
 {
-	int ret = 0;
-
-	spin_lock_irq(&nbd->sock_lock);
+	struct nbd_sock **socks;
+	struct nbd_sock *nsock;
 
-	if (nbd->sock) {
-		ret = -EBUSY;
-		goto out;
-	}
+	socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
+			 sizeof(struct nbd_sock *), GFP_KERNEL);
+	if (!socks)
+		return -ENOMEM;
+	nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
+	if (!nsock)
+		return -ENOMEM;
 
-	nbd->sock = sock;
+	nbd->socks = socks;
 
-out:
-	spin_unlock_irq(&nbd->sock_lock);
+	mutex_init(&nsock->tx_lock);
+	nsock->sock = sock;
+	socks[nbd->num_connections++] = nsock;
 
-	return ret;
+	return 0;
 }
 
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
+	int i;
+
+	for (i = 0; i < nbd->num_connections; i++)
+		kfree(nbd->socks[i]);
+	kfree(nbd->socks);
+	nbd->socks = NULL;
 	nbd->runtime_flags = 0;
 	nbd->blksize = 1024;
 	nbd->bytesize = 0;
 	set_capacity(nbd->disk, 0);
 	nbd->flags = 0;
 	nbd->tag_set.timeout = 0;
+	nbd->num_connections = 0;
+	atomic_set(&nbd->recv_threads, 0);
 	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 }
 
@@ -585,8 +582,7 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
 static int nbd_dev_dbg_init(struct nbd_device *nbd);
 static void nbd_dev_dbg_close(struct nbd_device *nbd);
 
-/* Must be called with tx_lock held */
-
+/* Must be called with config_lock held */
 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -595,27 +591,33 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		struct request *sreq;
 
 		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-		if (!nbd->sock)
+		if (!nbd->socks)
 			return -EINVAL;
 
 		sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
 		if (!sreq)
 			return -ENOMEM;
 
-		mutex_unlock(&nbd->tx_lock);
+		mutex_unlock(&nbd->config_lock);
 		fsync_bdev(bdev);
-		mutex_lock(&nbd->tx_lock);
+		mutex_lock(&nbd->config_lock);
 		sreq->cmd_type = REQ_TYPE_DRV_PRIV;
 
 		/* Check again after getting mutex back.  */
-		if (!nbd->sock) {
+		if (!nbd->socks) {
 			blk_mq_free_request(sreq);
 			return -EINVAL;
 		}
 
 		set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
 
-		nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
+		/*
+		 * Since we are holding the config lock here we can't do the
+		 * timeout work, so if this fails just shutdown the socks.
+		 */
+		if (blk_execute_rq(sreq->q, NULL, sreq, 0) &&
+		    !test_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
+			sock_shutdown(nbd);
 		blk_mq_free_request(sreq);
 		return 0;
 	}
@@ -633,7 +635,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (!sock)
 			return err;
 
-		err = nbd_set_socket(nbd, sock);
+		err = nbd_add_socket(nbd, sock);
 		if (!err && max_part)
 			bdev->bd_invalidated = 1;
 
@@ -662,26 +664,45 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		return 0;
 
 	case NBD_DO_IT: {
-		int error;
+		struct recv_thread_args *args;
+		int num_connections = nbd->num_connections;
+		int error, i;
 
 		if (nbd->task_recv)
 			return -EBUSY;
-		if (!nbd->sock)
+		if (!nbd->socks)
 			return -EINVAL;
 
-		/* We have to claim the device under the lock */
+		blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
+		args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
+		if (!args)
+			goto out_err;
 		nbd->task_recv = current;
-		mutex_unlock(&nbd->tx_lock);
+		mutex_unlock(&nbd->config_lock);
 
 		nbd_parse_flags(nbd, bdev);
 
+		nbd_size_update(nbd, bdev);
+
 		nbd_dev_dbg_init(nbd);
-		error = nbd_thread_recv(nbd, bdev);
+		for (i = 0; i < num_connections; i++) {
+			sk_set_memalloc(nbd->socks[i]->sock->sk);
+			atomic_inc(&nbd->recv_threads);
+			INIT_WORK(&args[i].work, recv_work);
+			args[i].nbd = nbd;
+			args[i].index = i;
+			queue_work(system_long_wq, &args[i].work);
+		}
+		wait_event_interruptible(nbd->recv_wq,
+					 atomic_read(&nbd->recv_threads) == 0);
+		for (i = 0; i < num_connections; i++)
+			flush_work(&args[i].work);
 		nbd_dev_dbg_close(nbd);
+		nbd_size_clear(nbd, bdev);
 
-		mutex_lock(&nbd->tx_lock);
+		mutex_lock(&nbd->config_lock);
 		nbd->task_recv = NULL;
-
+out_err:
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
 		kill_bdev(bdev);
@@ -726,9 +747,9 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	mutex_lock(&nbd->tx_lock);
+	mutex_lock(&nbd->config_lock);
 	error = __nbd_ioctl(bdev, nbd, cmd, arg);
-	mutex_unlock(&nbd->tx_lock);
+	mutex_unlock(&nbd->config_lock);
 
 	return error;
 }
@@ -748,8 +769,6 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 
 	if (nbd->task_recv)
 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
-	if (nbd->task_send)
-		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));
 
 	return 0;
 }
@@ -875,7 +894,7 @@ static int nbd_init_request(void *data, struct request *rq,
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
 	cmd->nbd = data;
-	INIT_LIST_HEAD(&cmd->list);
+	cmd->index = hctx_idx;
 	return 0;
 }
 
@@ -986,13 +1005,13 @@ static int __init nbd_init(void)
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].magic = NBD_MAGIC;
-		spin_lock_init(&nbd_dev[i].sock_lock);
-		mutex_init(&nbd_dev[i].tx_lock);
+		mutex_init(&nbd_dev[i].config_lock);
 		disk->major = NBD_MAJOR;
 		disk->first_minor = i << part_shift;
 		disk->fops = &nbd_fops;
 		disk->private_data = &nbd_dev[i];
 		sprintf(disk->disk_name, "nbd%d", i);
+		init_waitqueue_head(&nbd_dev[i].recv_wq);
 		nbd_reset(&nbd_dev[i]);
 		add_disk(disk);
 	}
-- 
2.8.0.rc2


^ permalink raw reply related	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
  2016-09-08 21:12 ` [PATCH 3/5] nbd: use flags instead of bool Josef Bacik
@ 2016-09-09  1:20   ` Joe Perches
  2016-09-09 13:55     ` Jens Axboe
  0 siblings, 1 reply; 95+ messages in thread
From: Joe Perches @ 2016-09-09  1:20 UTC (permalink / raw)
  To: Josef Bacik, linux-block, linux-kernel, kernel-team, mpa, nbd-general

On Thu, 2016-09-08 at 17:12 -0400, Josef Bacik wrote:
> In preparation for some future changes, change a few of the state bools over to
> normal bits to set/clear properly.
[]
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
[]
> @@ -41,8 +41,12 @@
>  
>  #include <linux/nbd.h>
>  
> +#define NBD_TIMEDOUT			0
> +#define NBD_DISCONNECT_REQUESTED	1
> +
>  struct nbd_device {
>  	u32 flags;
> +	unsigned long runtime_flags;

Better to use DECLARE_BITMAP

> @@ -626,7 +627,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
>  			return -EINVAL;
>  		}
>  
> -		nbd->disconnect = true;
> +		set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);

And remove the & from runtime_flags here


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
  2016-09-09  1:20   ` Joe Perches
@ 2016-09-09 13:55     ` Jens Axboe
  2016-09-09 16:04         ` Joe Perches
  0 siblings, 1 reply; 95+ messages in thread
From: Jens Axboe @ 2016-09-09 13:55 UTC (permalink / raw)
  To: Joe Perches, Josef Bacik, linux-block, linux-kernel, kernel-team,
	mpa, nbd-general

On 09/08/2016 07:20 PM, Joe Perches wrote:
> On Thu, 2016-09-08 at 17:12 -0400, Josef Bacik wrote:
>> In preparation for some future changes, change a few of the state bools over to
>> normal bits to set/clear properly.
> []
>> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> []
>> @@ -41,8 +41,12 @@
>>
>>  #include <linux/nbd.h>
>>
>> +#define NBD_TIMEDOUT			0
>> +#define NBD_DISCONNECT_REQUESTED	1
>> +
>>  struct nbd_device {
>>  	u32 flags;
>> +	unsigned long runtime_flags;
>
> Better to use DECLARE_BITMAP

It's a few flags, we know it fits in a long. There's no point to using
anything but that, and set/test/clear_bit().
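
To spell it out, the difference is just

  unsigned long runtime_flags;                  /* what the patch does */
  DECLARE_BITMAP(runtime_flags, 2);             /* the bitmap spelling */

and both get used through set_bit()/test_bit()/clear_bit(); the bitmap form
only buys you something once the bits no longer fit in one long (it also turns
the call sites from &nbd->runtime_flags into nbd->runtime_flags, since it's an
array).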

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
  2016-09-09 13:55     ` Jens Axboe
@ 2016-09-09 16:04         ` Joe Perches
  0 siblings, 0 replies; 95+ messages in thread
From: Joe Perches @ 2016-09-09 16:04 UTC (permalink / raw)
  To: Jens Axboe, Josef Bacik, linux-block, linux-kernel, kernel-team,
	mpa, nbd-general

On Fri, 2016-09-09 at 07:55 -0600, Jens Axboe wrote:
> On 09/08/2016 07:20 PM, Joe Perches wrote:
> > On Thu, 2016-09-08 at 17:12 -0400, Josef Bacik wrote:
> > > In preparation for some future changes, change a few of the state bools over to
> > > normal bits to set/clear properly.
> > []
> > > diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> > []
> > > @@ -41,8 +41,12 @@
> > >
> > > #include <linux/nbd.h>
> > >
> > > +#define NBD_TIMEDOUT			0
> > > +#define NBD_DISCONNECT_REQUESTED	1
> > > +
> > > struct nbd_device {
> > > 	u32 flags;
> > +	unsigned long runtime_flags;
> > Better to use DECLARE_BITMAP

> It's a few flags, we know it fits in a long. There's no point to using
> anything but that, and set/test/clear_bit().

It lets the reader know how it's used.



^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
  2016-09-09 16:04         ` Joe Perches
@ 2016-09-09 16:11         ` Jens Axboe
  2016-09-09 16:15           ` Joe Perches
  -1 siblings, 1 reply; 95+ messages in thread
From: Jens Axboe @ 2016-09-09 16:11 UTC (permalink / raw)
  To: Joe Perches, Josef Bacik, linux-block, linux-kernel, kernel-team,
	mpa, nbd-general

On 09/09/2016 10:04 AM, Joe Perches wrote:
> On Fri, 2016-09-09 at 07:55 -0600, Jens Axboe wrote:
>> On 09/08/2016 07:20 PM, Joe Perches wrote:
>>> On Thu, 2016-09-08 at 17:12 -0400, Josef Bacik wrote:
>>>> In preparation for some future changes, change a few of the state bools over to
>>>> normal bits to set/clear properly.
>>> []
>>>> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
>>> []
>>>> @@ -41,8 +41,12 @@
>>>>
>>>>  #include <linux/nbd.h>
>>>>
>>>> +#define NBD_TIMEDOUT			0
>>>> +#define NBD_DISCONNECT_REQUESTED	1
>>>> +
>>>>  struct nbd_device {
>>>>  	u32 flags;
>>> +	unsigned long runtime_flags;
>>> Better to use DECLARE_BITMAP
>
>> It's a few flags, we know it fits in a long. There's no point to using
>> anything but that, and set/test/clear_bit().
>
> It lets the reader know how it's used.

The variable is called 'runtime_flags' - if that doesn't already tell
the reader how it's used, then I'd suggest the reader go read something
else.

I'm all for using established APIs where it makes sense. Declaring a
bitmap for a few fields isn't that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
  2016-09-09 16:11         ` Jens Axboe
@ 2016-09-09 16:15           ` Joe Perches
  2016-09-09 16:20               ` Jens Axboe
  0 siblings, 1 reply; 95+ messages in thread
From: Joe Perches @ 2016-09-09 16:15 UTC (permalink / raw)
  To: Jens Axboe, Josef Bacik, linux-block, linux-kernel, kernel-team,
	mpa, nbd-general

On Fri, 2016-09-09 at 10:11 -0600, Jens Axboe wrote:
> The variable is called 'runtime_flags' - if that doesn't already tell
> the reader how it's used, then I'd suggest the reader go read something
> else.
> 
> I'm all for using established APIs where it makes sense. Declaring a
> bitmap for a few fields isn't that.

Deviating from established APIs makes no sense.



^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
  2016-09-09 16:15           ` Joe Perches
@ 2016-09-09 16:20               ` Jens Axboe
  0 siblings, 0 replies; 95+ messages in thread
From: Jens Axboe @ 2016-09-09 16:20 UTC (permalink / raw)
  To: Joe Perches, Josef Bacik, linux-block, linux-kernel, kernel-team,
	mpa, nbd-general

On 09/09/2016 10:15 AM, Joe Perches wrote:
> On Fri, 2016-09-09 at 10:11 -0600, Jens Axboe wrote:
>> The variable is called 'runtime_flags' - if that doesn't already tell
>> the reader how it's used, then I'd suggest the reader go read something
>> else.
>>
>> I'm all for using established APIs where it makes sense. Declaring a
>> bitmap for a few fields isn't that.
>
> Deviating from established APIs makes no sense.

Look, doing set/test/clear bit on an unsigned long is a very well
established API. We've been doing that _forever_. As I said in my
original email, I have nothing against using bitmaps _where they make
sense_. Manually allocating an array for X number of bits (where X >
32), yes, by all means use the bitmap helpers.

This is my final reply on this topic.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH 3/5] nbd: use flags instead of bool
@ 2016-09-09 16:20               ` Jens Axboe
  0 siblings, 0 replies; 95+ messages in thread
From: Jens Axboe @ 2016-09-09 16:20 UTC (permalink / raw)
  To: Joe Perches, Josef Bacik, linux-block, linux-kernel, kernel-team,
	mpa, nbd-general

On 09/09/2016 10:15 AM, Joe Perches wrote:
> On Fri, 2016-09-09 at 10:11 -0600, Jens Axboe wrote:
>> The variable is called 'runtime_flags' - if that doesn't already tell
>> the reader how it's used, then I'd suggest the reader go read something
>> else.
>>
>> I'm all for using established APIs where it makes sense. Declaring a
>> bitmap for a few fields isn't that.
>
> Deviating from established APIs makes no sense.

Look, doing set/test/clear bit on an unsigned long is a very well
established API. We've been doing that _forever_. As I said in my
original email, I have nothing against using bitmaps _where they make
sense_. Manually allocating an array for X number of bits (where X >
32), yes, by all means use the bitmap helpers.

This is my final reply on this topic.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
                   ` (4 preceding siblings ...)
  2016-09-08 21:12 ` [PATCH 5/5] nbd: add multi-connection support Josef Bacik
@ 2016-09-09 20:02 ` Wouter Verhelst
  2016-09-09 20:36   ` Josef Bacik
  2016-09-15 10:49   ` Wouter Verhelst
  5 siblings, 2 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-09 20:02 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-block, linux-kernel, kernel-team, mpa, nbd-general

Hi Josef,

On Thu, Sep 08, 2016 at 05:12:05PM -0400, Josef Bacik wrote:
> Apologies if you are getting this a second time, it appears vger ate my last
> submission.
> 
> ----------------------------------------------------------------------
> 
> This is a patch series aimed at bringing NBD into 2016.  The two big components
> of this series is converting nbd over to using blkmq and then allowing us to
> provide more than one connection for a nbd device.  The NBD user space server
> doesn't care about how many connections it has to a particular device, so we can
> easily open multiple connections to the server and allow blkmq to handle
> multi-plexing over the different connections.

I see some practical problems with this:
- You removed the pid attribute from sysfs (unless you added it back and
  I didn't notice, in which case just ignore this part). This kills
  userspace in two ways:
  - systemd/udev mark an NBD device as "not active" if the sysfs pid
    attribute is absent. Removing that attribute causes the new nbd
    systemd unit to stop working.
  - nbd-client -check relies on this attribute too, which means that
    even if people don't use systemd, their init scripts will still
    break, and vigilant sysadmins (who check before trying to connect
    something) will be surprised.
- What happens if userspace tries to connect an already-connected device
  to some other server? Currently that can't happen (you get EBUSY);
  with this patch, I believe it can, and data corruption would be the
  result (on *two* nbd devices). Additionally, with the loss of the pid
  attribute (as above) and the ensuing loss of the -check functionality,
  this might actually be a somewhat likely scenario.
- What happens if one of the multiple connections drop but the others do
  not?
- This all has the downside that userspace now has to predict how many
  parallel connections will be necessary and/or useful. If the initial
  guess was wrong, we don't have a way to correct later on.

My suggestion is to reject an additional connection unless it comes from
the same userspace process as the previous connections, and to retain
the pid attribute (since it is now guaranteed to be the same for all the
connections). That should fix the first two issues (while unfortunately
reinforcing the last one). The third would also need to have clearly
defined semantics, at the very least.

A better way, long term, would presumably be to modify the protocol to
allow multiplexing several requests in one NBD session. This would deal
with what you're trying to fix too[1], while it would not pull in all of
the above problems.

[1] after all, we have to serialize all traffic anyway, just before it
    heads into the NIC.

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-09 20:02 ` [Nbd] [RESEND][PATCH 0/5] nbd improvements Wouter Verhelst
@ 2016-09-09 20:36   ` Josef Bacik
  2016-09-09 20:55     ` Wouter Verhelst
  2016-09-15 10:49   ` Wouter Verhelst
  1 sibling, 1 reply; 95+ messages in thread
From: Josef Bacik @ 2016-09-09 20:36 UTC (permalink / raw)
  To: Wouter Verhelst; +Cc: linux-block, linux-kernel, kernel-team, mpa, nbd-general

On 09/09/2016 04:02 PM, Wouter Verhelst wrote:
> Hi Josef,
>
> On Thu, Sep 08, 2016 at 05:12:05PM -0400, Josef Bacik wrote:
>> Apologies if you are getting this a second time, it appears vger ate my last
>> submission.
>>
>> ----------------------------------------------------------------------
>>
>> This is a patch series aimed at bringing NBD into 2016.  The two big components
>> of this series is converting nbd over to using blkmq and then allowing us to
>> provide more than one connection for a nbd device.  The NBD user space server
>> doesn't care about how many connections it has to a particular device, so we can
>> easily open multiple connections to the server and allow blkmq to handle
>> multi-plexing over the different connections.
>
> I see some practical problems with this:
> - You removed the pid attribute from sysfs (unless you added it back and
>   I didn't notice, in which case just ignore this part). This kills
>   userspace in two ways:
>   - systemd/udev mark an NBD device as "not active" if the sysfs pid
>     attribute is absent. Removing that attribute causes the new nbd
>     systemd unit to stop working.
>   - nbd-client -check relies on this attribute too, which means that
>     even if people don't use systemd, their init scripts will still
>     break, and vigilant sysadmins (who check before trying to connect
>     something) will be surprised.

Ok I can add this back, I didn't see anybody using it, but again I didn't look 
very hard.

> - What happens if userspace tries to connect an already-connected device
>   to some other server? Currently that can't happen (you get EBUSY);
>   with this patch, I believe it can, and data corruption would be the
>   result (on *two* nbd devices). Additionally, with the loss of the pid
>   attribute (as above) and the ensuing loss of the -check functionality,
>   this might actually be a somewhat likely scenario.

Once you do DO_IT then you'll get the EBUSY, so no problems.  Now if you modify 
the client to connect to two different servers then yes you could have data 
corruption, but hey if you do stupid things then bad things happen, I'm not sure 
we need to explicitly keep this from happening.

> - What happens if one of the multiple connections drop but the others do
>   not?

It keeps on trucking, but the connections that break will return -EIO.  That's 
not good, I'll fix it to tear down everything if that happens.

> - This all has the downside that userspace now has to predict how many
>   parallel connections will be necessary and/or useful. If the initial
>   guess was wrong, we don't have a way to correct later on.

No, it relies on the admin to specify based on their environment.

>
> My suggestion is to reject an additional connection unless it comes from
> the same userspace process as the previous connections, and to retain
> the pid attribute (since it is now guaranteed to be the same for all the
> connections). That should fix the first two issues (while unfortunately
> reinforcing the last one). The third would also need to have clearly
> defined semantics, at the very least.

Yeah that sounds reasonable to me, I hadn't thought of some other pid trying to 
set up a device at the same time.

>
> A better way, long term, would presumably be to modify the protocol to
> allow multiplexing several requests in one NBD session. This would deal
> with what you're trying to fix too[1], while it would not pull in all of
> the above problems.
>
> [1] after all, we have to serialize all traffic anyway, just before it
>     heads into the NIC.
>

Yeah I considered changing the protocol to handle multiplexing different 
requests, but that runs into trouble since we can't guarantee that each discrete 
sendmsg/recvmsg is going to atomically copy our buffer in.  We can accomplish 
this with KCM of course which is a road I went down for a little while, but then 
we have the issue of the actual data to send across, and KCM is limited to a 
certain buffer size (I don't remember what it was exactly).  This limitation is 
fine in practice I think, but I got such good performance with multiple 
connections that I threw all that work away and went with this.

Thanks for the review, I'll fix up these issues you've pointed out and resend,

Josef

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-09 20:36   ` Josef Bacik
@ 2016-09-09 20:55     ` Wouter Verhelst
  2016-09-09 23:00       ` Josef Bacik
  0 siblings, 1 reply; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-09 20:55 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-block, mpa, kernel-team, nbd-general, linux-kernel

On Fri, Sep 09, 2016 at 04:36:07PM -0400, Josef Bacik wrote:
> On 09/09/2016 04:02 PM, Wouter Verhelst wrote:
[...]
> > I see some practical problems with this:
> > - You removed the pid attribute from sysfs (unless you added it back and
> >   I didn't notice, in which case just ignore this part). This kills
> >   userspace in two ways:
> >   - systemd/udev mark an NBD device as "not active" if the sysfs pid
> >     attribute is absent. Removing that attribute causes the new nbd
> >     systemd unit to stop working.
> >   - nbd-client -check relies on this attribute too, which means that
> >     even if people don't use systemd, their init scripts will still
> >     break, and vigilant sysadmins (who check before trying to connect
> >     something) will be surprised.
> 
> Ok I can add this back, I didn't see anybody using it, but again I didn't look 
> very hard.

Thank you.

> > - What happens if userspace tries to connect an already-connected device
> >   to some other server? Currently that can't happen (you get EBUSY);
> >   with this patch, I believe it can, and data corruption would be the
> >   result (on *two* nbd devices). Additionally, with the loss of the pid
> >   attribute (as above) and the ensuing loss of the -check functionality,
> >   this might actually be a somewhat likely scenario.
> 
> Once you do DO_IT then you'll get the EBUSY, so no problems.

Oh, okay. I missed that part.

> Now if you modify the client to connect to two different servers then yes you
> could have data corruption, but hey if you do stupid things then bad things
> happen, I'm not sure we need to explicitly keep this from happening.

Yeah, totally agree there.

> > - What happens if one of the multiple connections drop but the others do
> >   not?
> 
> It keeps on trucking, but the connections that break will return -EIO.  That's 
> not good, I'll fix it to tear down everything if that happens.

Right. Alternatively, you could perhaps make it so that the lost
connection is removed, unack'd requests on that connection are resent,
and the session moves on with one less connection (unless the lost
connection is the last one, in which case we die as before). That might
be too much work and not worth it though.

> > - This all has the downside that userspace now has to predict how many
> >   parallel connections will be necessary and/or useful. If the initial
> >   guess was wrong, we don't have a way to correct later on.
> 
> No, it relies on the admin to specify based on their environment.

Sure, but I suppose it would be nice if things could dynamically grow
when needed, and/or that the admin could modify the number of
connections of an already-connected device. Then again, this might also
be too much work and not worth it.

[...]
> > A better way, long term, would presumably be to modify the protocol to
> > allow multiplexing several requests in one NBD session. This would deal
> > with what you're trying to fix too[1], while it would not pull in all of
> > the above problems.
> >
> > [1] after all, we have to serialize all traffic anyway, just before it
> >     heads into the NIC.
> 
> Yeah I considered changing the protocol to handle multiplexing different 
> requests, but that runs into trouble since we can't guarantee that each discrete 
> sendmsg/recvmsg is going to atomically copy our buffer in.  We can accomplish 
> this with KCM of course which is a road I went down for a little while, but then 
> we have the issue of the actual data to send across, and KCM is limited to a 
> certain buffer size (I don't remember what it was exactly).  This limitation is 
> fine in practice I think, but I got such good performance with multiple 
> connections that I threw all that work away and went with this.

Okay, sounds like you've given that way more thought than me, and that
that's a dead end. Never mind then.

> Thanks for the review, I'll fix up these issues you've pointed out and resend,

Thanks,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-09 20:55     ` Wouter Verhelst
@ 2016-09-09 23:00       ` Josef Bacik
  2016-09-09 23:37         ` Jens Axboe
  0 siblings, 1 reply; 95+ messages in thread
From: Josef Bacik @ 2016-09-09 23:00 UTC (permalink / raw)
  To: Wouter Verhelst; +Cc: linux-block, mpa, kernel-team, nbd-general, linux-kernel

On 09/09/2016 04:55 PM, Wouter Verhelst wrote:
> On Fri, Sep 09, 2016 at 04:36:07PM -0400, Josef Bacik wrote:
>> On 09/09/2016 04:02 PM, Wouter Verhelst wrote:
> [...]
>>> I see some practical problems with this:
>>> - You removed the pid attribute from sysfs (unless you added it back and
>>>   I didn't notice, in which case just ignore this part). This kills
>>>   userspace in two ways:
>>>   - systemd/udev mark an NBD device as "not active" if the sysfs pid
>>>     attribute is absent. Removing that attribute causes the new nbd
>>>     systemd unit to stop working.
>>>   - nbd-client -check relies on this attribute too, which means that
>>>     even if people don't use systemd, their init scripts will still
>>>     break, and vigilant sysadmins (who check before trying to connect
>>>     something) will be surprised.
>>
>> Ok I can add this back, I didn't see anybody using it, but again I didn't look
>> very hard.
>
> Thank you.
>
>>> - What happens if userspace tries to connect an already-connected device
>>>   to some other server? Currently that can't happen (you get EBUSY);
>>>   with this patch, I believe it can, and data corruption would be the
>>>   result (on *two* nbd devices). Additionally, with the loss of the pid
>>>   attribute (as above) and the ensuing loss of the -check functionality,
>>>   this might actually be a somewhat likely scenario.
>>
>> Once you do DO_IT then you'll get the EBUSY, so no problems.
>
> Oh, okay. I missed that part.
>
>> Now if you modify the client to connect to two different servers then yes you
>> could have data corruption, but hey if you do stupid things then bad things
>> happen, I'm not sure we need to explicitly keep this from happening.
>
> Yeah, totally agree there.
>
>>> - What happens if one of the multiple connections drop but the others do
>>>   not?
>>
>> It keeps on trucking, but the connections that break will return -EIO.  That's
>> not good, I'll fix it to tear down everything if that happens.
>
> Right. Alternatively, you could perhaps make it so that the lost
> connection is removed, unack'd requests on that connection are resent,
> and the session moves on with one less connection (unless the lost
> connection is the last one, in which case we die as before). That might
> be too much work and not worth it though.

Yeah I wasn't sure if we could just randomly remove hw queues in blk-mq while 
the device is still up.  If that is in fact easy to do then I'm in favor of 
trucking along with less connections than we originally had, otherwise I think 
it'll be too big of a pain.  Also some users (Facebook in this case) would 
rather the whole thing fail so we can figure out what went wrong rather than 
suddenly going at a degraded performance.

>
>>> - This all has the downside that userspace now has to predict how many
>>>   parallel connections will be necessary and/or useful. If the initial
>>>   guess was wrong, we don't have a way to correct later on.
>>
>> No, it relies on the admin to specify based on their environment.
>
> Sure, but I suppose it would be nice if things could dynamically grow
> when needed, and/or that the admin could modify the number of
> connections of an already-connected device. Then again, this might also
> be too much work and not worth it.

I mean we can set some magic number, like num_connections = min(nr_cpus, 4); and 
then that way people who aren't paying attention suddenly are going faster.  I 
think anything smarter and we'd have to figure out how fast the link is and at 
what point we're hitting diminishing returns, and down that path lies sadness.

>
> [...]
>>> A better way, long term, would presumably be to modify the protocol to
>>> allow multiplexing several requests in one NBD session. This would deal
>>> with what you're trying to fix too[1], while it would not pull in all of
>>> the above problems.
>>>
>>> [1] after all, we have to serialize all traffic anyway, just before it
>>>     heads into the NIC.
>>
>> Yeah I considered changing the protocol to handle multiplexing different
>> requests, but that runs into trouble since we can't guarantee that each discrete
>> sendmsg/recvmsg is going to atomically copy our buffer in.  We can accomplish
>> this with KCM of course which is a road I went down for a little while, but then
>> we have the issue of the actual data to send across, and KCM is limited to a
>> certain buffer size (I don't remember what it was exactly).  This limitation is
>> fine in practice I think, but I got such good performance with multiple
>> connections that I threw all that work away and went with this.
>
> Okay, sounds like you've given that way more thought than me, and that
> that's a dead end. Never mind then.
>

Not necessarily a dead end, but a path that requires a lot of thought and 
experimentation to figure out if it's worth it.  So maybe I'll waste an intern's 
summer on figuring it out, but I've got other things I'd rather do ;).  Thanks,

Josef


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-09 23:00       ` Josef Bacik
@ 2016-09-09 23:37         ` Jens Axboe
  0 siblings, 0 replies; 95+ messages in thread
From: Jens Axboe @ 2016-09-09 23:37 UTC (permalink / raw)
  To: Josef Bacik, Wouter Verhelst
  Cc: linux-block, mpa, kernel-team, nbd-general, linux-kernel

On 09/09/2016 05:00 PM, Josef Bacik wrote:
>> Right. Alternatively, you could perhaps make it so that the lost
>> connection is removed, unack'd requests on that connection are resent,
>> and the session moves on with one less connection (unless the lost
>> connection is the last one, in which case we die as before). That might
>> be too much work and not worth it though.
>
> Yeah I wasn't sure if we could just randomly remove hw queues in blk-mq
> while the device is still up.  If that is in fact easy to do then I'm in
> favor of trucking along with less connections than we originally had,
> otherwise I think it'll be too big of a pain.  Also some users (Facebook
> in this case) would rather the whole thing fail so we can figure out
> what went wrong rather than suddenly going at a degraded performance.

We can do that online. We do that for CPU hotplug/unplug events, and we
also expose that functionality to drivers through
blk_mq_update_nr_hw_queues(). So yes, should be trivial to support from
nbd.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 95+ messages in thread
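
A minimal sketch of what this would look like from the nbd side, assuming the driver
keeps the blk_mq_tag_set from the blkmq conversion around; nbd_connection_lost() and
its arguments are hypothetical.

#include <linux/blk-mq.h>

/* Hypothetical: called when one of several connections dies and the policy is
 * to keep going with fewer queues rather than failing the whole device. */
static void nbd_connection_lost(struct nbd_device *nbd, int remaining)
{
	if (remaining <= 0)
		return;		/* last connection gone: fail the device as before */

	/* Re-allocates and re-maps the hardware contexts for the new count. */
	blk_mq_update_nr_hw_queues(&nbd->tag_set, remaining);
}

Whether to degrade or to fail hard is the policy question discussed above; the
mechanism itself is just this one call.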

* Re: [PATCH 5/5] nbd: add multi-connection support
  2016-09-08 21:12 ` [PATCH 5/5] nbd: add multi-connection support Josef Bacik
@ 2016-09-10  7:43   ` Christoph Hellwig
  2016-09-12 13:11     ` Josef Bacik
  0 siblings, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-10  7:43 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-block, linux-kernel, kernel-team, mpa, nbd-general

Hi Josef,

I haven't read the full patch as I'm a bit in a hurry, but is there
a good reason to not simply have a socket per-hw_ctx and store it in
the hw_ctx private data instead of using the index in the nbd_cmd
structure?

^ permalink raw reply	[flat|nested] 95+ messages in thread
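
For reference, a rough sketch of the shape being suggested here: one socket per
hardware queue, reached through the hctx private data. nbd_sock_for_index() is an
invented placeholder for however the driver actually stores its sockets.

#include <linux/blk-mq.h>
#include <linux/net.h>

/* Hypothetical lookup helper for this sketch. */
static struct socket *nbd_sock_for_index(struct nbd_device *nbd,
					 unsigned int index);

static int nbd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			 unsigned int index)
{
	struct nbd_device *nbd = data;	/* from tag_set->driver_data */

	/* Stash this queue's socket once, instead of carrying an index in nbd_cmd. */
	hctx->driver_data = nbd_sock_for_index(nbd, index);
	return 0;
}

static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
			const struct blk_mq_queue_data *bd)
{
	struct socket *sock = hctx->driver_data;

	/* ... issue bd->rq over 'sock' ... */
	return BLK_MQ_RQ_QUEUE_OK;
}

Both callbacks would be wired up through the driver's blk_mq_ops.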

* Re: [PATCH 5/5] nbd: add multi-connection support
  2016-09-10  7:43   ` Christoph Hellwig
@ 2016-09-12 13:11     ` Josef Bacik
  0 siblings, 0 replies; 95+ messages in thread
From: Josef Bacik @ 2016-09-12 13:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, linux-kernel, kernel-team, mpa, nbd-general

On 09/10/2016 03:43 AM, Christoph Hellwig wrote:
> Hi Josef,
>
> I haven't read the full patch as I'm a bit in a hurry, but is there
> a good reason to not simply have a socket per-hw_ctx and store it in
> the hw_ctx private data instead of using the index in the nbd_cmd
> structure?
>

No good reason, just didn't know I could do that.  I'll fix things up to do that 
instead, thanks,

Josef

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-09 20:02 ` [Nbd] [RESEND][PATCH 0/5] nbd improvements Wouter Verhelst
  2016-09-09 20:36   ` Josef Bacik
@ 2016-09-15 10:49   ` Wouter Verhelst
  2016-09-15 11:09       ` Alex Bligh
  2016-09-15 11:38     ` Christoph Hellwig
  1 sibling, 2 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 10:49 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-block, mpa, kernel-team, nbd-general, linux-kernel

Hi,

On Fri, Sep 09, 2016 at 10:02:03PM +0200, Wouter Verhelst wrote:
> I see some practical problems with this:
[...]

One more that I didn't think about earlier:

A while back, we spent quite some time defining the semantics of the
various commands in the face of the NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA
write barriers. At the time, we decided that it would be unreasonable
to expect servers to make these write barriers effective across
different connections.

Since my knowledge of kernel internals is limited, I tried finding some
documentation on this, but I guess that either it doesn't exist or I'm
looking in the wrong place; therefore, am I correct in assuming that
blk-mq knows about such semantics, and will handle them correctly (by
either sending a write barrier to all queues, or not making assumptions
about write barriers that were sent over a different queue)? If not,
this may be something that needs to be taken care of.

Thanks,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 10:49   ` Wouter Verhelst
@ 2016-09-15 11:09       ` Alex Bligh
  2016-09-15 11:38     ` Christoph Hellwig
  1 sibling, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 11:09 UTC (permalink / raw)
  To: Wouter Verhelst, Josef Bacik
  Cc: Alex Bligh, linux-block, Markus Pargmann, kernel-team,
	nbd-general, linux-kernel, Eric Blake

Wouter, Josef, (& Eric)

> On 15 Sep 2016, at 11:49, Wouter Verhelst <w@uter.be> wrote:
> 
> Hi,
> 
> On Fri, Sep 09, 2016 at 10:02:03PM +0200, Wouter Verhelst wrote:
>> I see some practical problems with this:
> [...]
> 
> One more that I didn't think about earlier:
> 
> A while back, we spent quite some time defining the semantics of the
> various commands in the face of the NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA
> write barriers. At the time, we decided that it would be unreasonable
> to expect servers to make these write barriers effective across
> different connections.

Actually I wonder if there is a wider problem in that implementations
might mediate access to a device by presence of an extant TCP connection,
i.e. only permit one TCP connection to access a given block device at
once. If you think about (for instance) a forking daemon that does
writeback caching, that would be an entirely reasonable thing to do
for data consistency.

I also wonder whether any servers that can do caching per
connection will always share a consistent cache between 
connections. The one I'm worried about in particular here
is qemu-nbd - Eric Blake CC'd.

A more general point is that with multiple queues requests
may be processed in a different order even by those servers that
currently process the requests in strict order, or in something
similar to strict order. The server is permitted by the spec
(save as mandated by NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA) to
process commands out of order anyway, but I suspect this has
to date been little tested.

Lastly I confess to lack of familiarity with the kernel side
code, but how is NBD_CMD_DISCONNECT synchronised across
each of the connections? Presumably you need to send it on
each channel, but cannot assume the NBD connection as a whole
is dead until the last tcp connection has closed?

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:09       ` Alex Bligh
@ 2016-09-15 11:29         ` Wouter Verhelst
  -1 siblings, 0 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 11:29 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Josef Bacik, nbd-general, linux-kernel, linux-block,
	Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 12:09:28PM +0100, Alex Bligh wrote:
> Wouter, Josef, (& Eric)
> 
> > On 15 Sep 2016, at 11:49, Wouter Verhelst <w@uter.be> wrote:
> > 
> > Hi,
> > 
> > On Fri, Sep 09, 2016 at 10:02:03PM +0200, Wouter Verhelst wrote:
> >> I see some practical problems with this:
> > [...]
> > 
> > One more that I didn't think about earlier:
> > 
> > A while back, we spent quite some time defining the semantics of the
> > various commands in the face of the NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA
> > write barriers. At the time, we decided that it would be unreasonable
> > to expect servers to make these write barriers effective across
> > different connections.
> 
> Actually I wonder if there is a wider problem in that implementations
> might mediate access to a device by presence of an extant TCP connection,
> i.e. only permit one TCP connection to access a given block device at
> once. If you think about (for instance) a forking daemon that does
> writeback caching, that would be an entirely reasonable thing to do
> for data consistency.

Sure. They will have to live with the fact that clients connected to
them will run slower; I don't think that's a problem. In addition,
Josef's client implementation requires the user to explicitly ask for
multiple connections.

There are multiple contexts in which NBD can be used, and in some
performance is more important than in others. I think that is fine.

[...]
> A more general point is that with multiple queues requests
> may be processed in a different order even by those servers that
> currently process the requests in strict order, or in something
> similar to strict order. The server is permitted by the spec
> (save as mandated by NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA) to
> process commands out of order anyway, but I suspect this has
> to date been little tested.

Yes, and that is why I was asking about this. If the write barriers
are expected to be shared across connections, we have a problem. If,
however, they are not, then it doesn't matter that the commands may be
processed out of order.

[...]

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 10:49   ` Wouter Verhelst
  2016-09-15 11:09       ` Alex Bligh
@ 2016-09-15 11:38     ` Christoph Hellwig
  2016-09-15 11:43         ` Alex Bligh
  2016-09-15 11:55       ` Wouter Verhelst
  1 sibling, 2 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 11:38 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Josef Bacik, linux-block, mpa, kernel-team, nbd-general, linux-kernel

On Thu, Sep 15, 2016 at 12:49:35PM +0200, Wouter Verhelst wrote:
> A while back, we spent quite some time defining the semantics of the
> various commands in the face of the NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA
> write barriers. At the time, we decided that it would be unreasonable
> to expect servers to make these write barriers effective across
> different connections.

Do you have a nbd protocol specification?  treating a flush or fua
as any sort of barrier is incredibly stupid.  Is it really documented
that way, and if yes, why?

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:09       ` Alex Bligh
@ 2016-09-15 11:39         ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 11:39 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Wouter Verhelst, Josef Bacik, linux-block, Markus Pargmann,
	kernel-team, nbd-general, linux-kernel, Eric Blake

On Thu, Sep 15, 2016 at 12:09:28PM +0100, Alex Bligh wrote:
> A more general point is that with multiple queues requests
> may be processed in a different order even by those servers that
> currently process the requests in strict order, or in something
> similar to strict order. The server is permitted by the spec
> (save as mandated by NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA) to
> process commands out of order anyway, but I suspect this has
> to date been little tested.

The Linux kernel does not assume any synchronization between block I/O
commands.  So any sort of synchronization a protocol does is complete
overkill for us.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:29         ` Wouter Verhelst
@ 2016-09-15 11:40           ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 11:40 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, Josef Bacik, nbd-general, linux-kernel, linux-block,
	Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 01:29:36PM +0200, Wouter Verhelst wrote:
> Yes, and that is why I was asking about this. If the write barriers
> are expected to be shared across connections, we have a problem. If,
> however, they are not, then it doesn't matter that the commands may be
> processed out of order.

There is no such thing as a write barrier in the Linux kernel.  We'd
much prefer protocols not to introduce any pointless synchronization
if we can avoid it.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 11:43         ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 11:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, Josef Bacik,
	linux-kernel, linux-block, Markus Pargmann, kernel-team

Christoph,

> On 15 Sep 2016, at 12:38, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 12:49:35PM +0200, Wouter Verhelst wrote:
>> A while back, we spent quite some time defining the semantics of the
>> various commands in the face of the NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA
>> write barriers. At the time, we decided that it would be unreasonable
>> to expect servers to make these write barriers effective across
>> different connections.
> 
> Do you have a nbd protocol specification?  treating a flush or fua
> as any sort of barrier is incredibly stupid.  Is it really documented
> that way, and if yes, why?

Sure, it's at:

https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes

and that link takes you to the specific section.

The treatment of FLUSH and FUA is meant to mirror exactly the
linux block layer (or rather how the linux block layer was a few
years ago). I even asked on LKML to verify a few points.

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 11:46             ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 11:46 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, Josef Bacik, nbd-general,
	linux-kernel, linux-block, Markus Pargmann, kernel-team


> On 15 Sep 2016, at 12:40, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 01:29:36PM +0200, Wouter Verhelst wrote:
>> Yes, and that is why I was asking about this. If the write barriers
>> are expected to be shared across connections, we have a problem. If,
>> however, they are not, then it doesn't matter that the commands may be
>> processed out of order.
> 
> There is no such thing as a write barrier in the Linux kernel.  We'd
> much prefer protocols not to introduce any pointless synchronization
> if we can avoid it.

I suspect the issue is terminological.

Essentially NBD does support FLUSH/FUA like this:

https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt

IE supports the same FLUSH/FUA primitives as other block drivers (AIUI).

Link to protocol (per last email) here:

https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:43         ` Alex Bligh
@ 2016-09-15 11:46           ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 11:46 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, Wouter Verhelst, nbd-general, Josef Bacik,
	linux-kernel, linux-block, Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 12:43:35PM +0100, Alex Bligh wrote:
> Sure, it's at:
> 
> https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes
> 
> and that link takes you to the specific section.
> 
> The treatment of FLUSH and FUA is meant to mirror exactly the
> linux block layer (or rather how the linux block layer was a few
> years ago). I even asked on LKML to verify a few points.

Linux never expected ordering on the wire.  Before 2010 we had barriers
in the kernel that provided ordering to the caller, but we never
required it from the protocol / hardware.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:46             ` Alex Bligh
@ 2016-09-15 11:52               ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 11:52 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, Wouter Verhelst, Josef Bacik, nbd-general,
	linux-kernel, linux-block, Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 12:46:07PM +0100, Alex Bligh wrote:
> > Essentially NBD does support FLUSH/FUA like this:
> 
> https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt
> 
> IE supports the same FLUSH/FUA primitives as other block drivers (AIUI).
> 
> Link to protocol (per last email) here:
> 
> https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes

Flush as defined by the Linux block layer (and supported that way in
SCSI, ATA, NVMe) only requires flushing all already completed writes
to non-volatile media.  It does not impose any ordering unlike the
nbd spec.

FUA as defined by the Linux block layer (and supported that way in SCSI,
ATA, NVMe) only requires the write operation the FUA bit is set on to be
on non-volatile media before completing the write operation.  It does
not impose any ordering, which seems to match the nbd spec.  Unlike the
NBD spec, Linux does not allow FUA to be set on anything but WRITE
commands.  Some other storage protocols allow a FUA bit on READ
commands or other commands that write data to the device, though.

^ permalink raw reply	[flat|nested] 95+ messages in thread
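
For concreteness, this is roughly what those semantics look like from a blk-mq driver
such as nbd (a sketch against the block API of that era; the NBD_CMD_* mapping shown
is the obvious one, not a quote from the patches):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/nbd.h>

/* Advertise a volatile write cache (and FUA); only then does the block layer
 * issue flushes or set REQ_FUA at all. */
static void nbd_advertise_cache(struct request_queue *q)
{
	blk_queue_write_cache(q, true, true);
}

/* Classify a request.  No ordering is implied anywhere: a flush only has to
 * cover writes that have already been completed back to the block layer. */
static u32 nbd_cmd_type(struct request *req)
{
	if (req_op(req) == REQ_OP_FLUSH)
		return NBD_CMD_FLUSH;
	if (rq_data_dir(req) == WRITE)
		return NBD_CMD_WRITE;	/* FUA, if advertised, rides along as a flag */
	return NBD_CMD_READ;
}

On the wire, the flush reply then only vouches for writes that were already
acknowledged to the client.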

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:38     ` Christoph Hellwig
  2016-09-15 11:43         ` Alex Bligh
@ 2016-09-15 11:55       ` Wouter Verhelst
  2016-09-15 12:01         ` Christoph Hellwig
  1 sibling, 1 reply; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 11:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: nbd-general, Josef Bacik, linux-kernel, linux-block, mpa, kernel-team

On Thu, Sep 15, 2016 at 04:38:07AM -0700, Christoph Hellwig wrote:
> On Thu, Sep 15, 2016 at 12:49:35PM +0200, Wouter Verhelst wrote:
> > A while back, we spent quite some time defining the semantics of the
> > various commands in the face of the NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA
> > write barriers. At the time, we decided that it would be unreasonable
> > to expect servers to make these write barriers effective across
> > different connections.
> 
> Do you have a nbd protocol specification?

https://github.com/yoe/nbd/blob/master/doc/proto.md

> treating a flush or fua as any sort of barrier is incredibly stupid.

Maybe I'm not using the correct terminology here. The point is that
after a FLUSH, the server asserts that all write commands *for which a
reply has already been sent to the client* will also have reached
permanent storage. Nothing is asserted about commands for which the
reply has not yet been sent, regardless of whether they were sent to the
server before or after the FLUSH; they may reach permanent storage as a
result of the FLUSH, or they may not, we don't say.

With FUA, we only assert that the FUA-flagged command reaches permanent
storage before its reply is sent, nothing else.

If that's not a write barrier, then I was using the wrong terminology
(and offer my apologies for the confusion).

The point still remains that "X was sent before Y" is difficult to
determine on the client side if X was sent over a different TCP channel
than Y, because a packet might be dropped (requiring a retransmit) for
X, and similar things. If blk-mq can deal with that, we're good and
nothing further needs to be done. If not, this should be evaluated by
someone more familiar with the internals of the kernel block subsystem
than me.

> Is it really documented that way, and if yes, why?

The documentation does use the term write barrier, but only right before
the same explanation as above. It does so because I assumed that that is
what a write barrier is, and that this would make things easier to
understand. If you tell me that that term is wrong as used there, I can
easily remove it (it's not critical to the rest of the documentation).

Regards,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 11:56             ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 11:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, Josef Bacik,
	linux-kernel, linux-block, Markus Pargmann, kernel-team


> On 15 Sep 2016, at 12:46, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 12:43:35PM +0100, Alex Bligh wrote:
>> Sure, it's at:
>> 
>> https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes
>> 
>> and that link takes you to the specific section.
>> 
>> The treatment of FLUSH and FUA is meant to mirror exactly the
>> linux block layer (or rather how the linux block layer was a few
>> years ago). I even asked on LKML to verify a few points.
> 
> Linux never expected ordering on the wire.  Before 2010 we had barriers
> in the kernel that provided ordering to the caller, but we never
> required it from the protocol / hardware.

Sure. And I think the doc section reflects exactly Linux's post-2010
expectations (note the link to the kernel documentation). IE
servers are not required to order reads/writes (save that all
writes that the server completes prior to replying to an NBD_CMD_FLUSH
must be persisted prior to the reply to that NBD_CMD_FLUSH,
which I believe to be exactly the same behaviour as the kernel
dealing with an empty bio with REQ_FLUSH set).

Perhaps the section should be called "No ordering of messages
and writes"!

My point was that *in practice* disordering is not well tested
as *in practice* many server implementations do in fact process
each command in order, though that's changed recently.

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:55       ` Wouter Verhelst
@ 2016-09-15 12:01         ` Christoph Hellwig
  2016-09-15 12:11             ` Alex Bligh
  2016-09-15 12:21           ` Wouter Verhelst
  0 siblings, 2 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:01 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Christoph Hellwig, nbd-general, Josef Bacik, linux-kernel,
	linux-block, mpa, kernel-team

On Thu, Sep 15, 2016 at 01:55:14PM +0200, Wouter Verhelst wrote:
> Maybe I'm not using the correct terminology here. The point is that
> after a FLUSH, the server asserts that all write commands *for which a
> reply has already been sent to the client* will also have reached
> permanent storage. Nothing is asserted about commands for which the
> reply has not yet been sent, regardless of whether they were sent to the
> server before or after the FLUSH; they may reach permanent storage as a
> result of the FLUSH, or they may not, we don't say.
> 
> With FUA, we only assert that the FUA-flagged command reaches permanent
> storage before its reply is sent, nothing else.

Yes, that's the correct semantics.

> If that's not a write barrier, then I was using the wrong terminology
> (and offer my apologies for the confusion).

It's not a write barrier - a write barrier was a command that ensured that

 a) all previous writes were completed to the host/client
 b) all previous writes were on non-volatile storage

and

 c) the actual write with the barrier bit was on non-volatile storage

> The point still remains that "X was sent before Y" is difficult to
> determine on the client side if X was sent over a different TCP channel
> than Y, because a packet might be dropped (requiring a retransmit) for
> X, and similar things. If blk-mq can deal with that, we're good and
> nothing further needs to be done. If not, this should be evaluated by
> someone more familiar with the internals of the kernel block subsystem
> than me.

The important bit in all the existing protocols, and which Linux relies
on, is that any write the Linux block layer got a completion for earlier
needs to be flushed out to non-volatile storage when a FLUSH command is
sent.  Anything that is still in flight does not matter.  Which for
NBD means anything that you have already replied to needs to be flushed.

Or to say it more practically - in the nbd server you simply need to
call fdatasync on the backing device or file whenever you get a FLUSH
request, and it will do the right thing.
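
As a concrete (if simplified) illustration - not code from any real
server, and the reply helper below is an assumed name - the FLUSH path
of a file-backed server can be as small as:

#include <unistd.h>
#include <errno.h>
#include <stdint.h>

/* send_simple_reply() is assumed to exist elsewhere in the server. */
int send_simple_reply(int sock, uint64_t handle, uint32_t error);

int handle_flush(int sock, int export_fd, uint64_t handle)
{
        /*
         * Every write we have already replied to returned from write()
         * before its reply went out, so its data is at least in the
         * page cache; fdatasync() pushes all of that to stable storage.
         * Writes still in flight may or may not be included - the
         * protocol does not care either way.
         */
        if (fdatasync(export_fd) < 0)
                return send_simple_reply(sock, handle, errno);

        return send_simple_reply(sock, handle, 0);
}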

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:52               ` Christoph Hellwig
@ 2016-09-15 12:01                 ` Wouter Verhelst
  -1 siblings, 0 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 12:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, nbd-general, linux-block, Josef Bacik, linux-kernel,
	Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 04:52:17AM -0700, Christoph Hellwig wrote:
> On Thu, Sep 15, 2016 at 12:46:07PM +0100, Alex Bligh wrote:
> > Essentially NBD does supports FLUSH/FUA like this:
> > 
> > https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt
> > 
> > IE supports the same FLUSH/FUA primitives as other block drivers (AIUI).
> > 
> > Link to protocol (per last email) here:
> > 
> > https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes
> 
> Flush as defined by the Linux block layer (and supported that way in
> SCSI, ATA, NVMe) only requires to flush all already completed writes
> to non-volatile media.

That is precisely what FLUSH in nbd does, too.

> It does not impose any ordering unlike the nbd spec.

If you read the spec differently, then that's a bug in the spec. Can you
clarify which part of it caused that confusion? We should fix it, then.

> FUA as defined by the Linux block layer (and supported that way in SCSI,
> ATA, NVMe) only requires the write operation the FUA bit is set on to be
> on non-volatile media before completing the write operation.  It does
> not impose any ordering, which seems to match the nbd spec.  Unlike the
> NBD spec Linux does not allow FUA to be set on anything by WRITE
> commands.  Some other storage protocols allow a FUA bit on READ
> commands or other commands that write data to the device, though.

Yes. There was some discussion on that part, and we decided that setting
the flag doesn't hurt, but the spec also clarifies that using it on READ
does nothing, semantically.

The problem is that there are clients in the wild which do set it on
READ, so it's just a matter of "be liberal in what you accept".
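
In server terms that can be as simple as masking the flag off before
acting on the request - a minimal sketch, using the constants from the
protocol but otherwise invented names:

#include <stdint.h>

#define NBD_CMD_READ      0
#define NBD_CMD_FLAG_FUA  (1 << 0)

/* Tolerate, but ignore, FUA on reads: it has no semantic effect. */
static uint16_t effective_flags(uint16_t type, uint16_t flags)
{
        if (type == NBD_CMD_READ)
                flags &= ~NBD_CMD_FLAG_FUA;
        return flags;
}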

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 12:04                 ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 12:04 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, Josef Bacik, nbd-general,
	linux-kernel, linux-block, Markus Pargmann, kernel-team


> On 15 Sep 2016, at 12:52, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 12:46:07PM +0100, Alex Bligh wrote:
>> Essentially NBD does supports FLUSH/FUA like this:
>> 
>> https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt
>> 
>> IE supports the same FLUSH/FUA primitives as other block drivers (AIUI).
>> 
>> Link to protocol (per last email) here:
>> 
>> https://github.com/yoe/nbd/blob/master/doc/proto.md#ordering-of-messages-and-writes
> 
> Flush as defined by the Linux block layer (and supported that way in
> SCSI, ATA, NVMe) only requires to flush all already completed writes
> to non-volatile media.  It does not impose any ordering unlike the
> nbd spec.

As maintainer of the NBD spec, I'm confused as to why you think it
imposes any ordering - if you think this, clearly I need to clean up
the wording.

Here's what it says:

> The server MAY process commands out of order, and MAY reply out of order,
> except that:
> 
> 	• All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM)
> that the server completes (i.e. replies to) prior to processing to a
> NBD_CMD_FLUSH MUST be written to non-volatile storage prior to replying to that
> NBD_CMD_FLUSH. This paragraph only applies if NBD_FLAG_SEND_FLUSH is set within
> the transmission flags, as otherwise NBD_CMD_FLUSH will never be sent by the
> client to the server.

(and another bit re FUA that isn't relevant here).

Here's the Linux Kernel documentation:

> The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from
> the filesystem and will make sure the volatile cache of the storage device
> has been flushed before the actual I/O operation is started.  This explicitly
> guarantees that previously completed write requests are on non-volatile
> storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be
> set on an otherwise empty bio structure, which causes only an explicit cache
> flush without any dependent I/O.  It is recommend to use
> the blkdev_issue_flush() helper for a pure cache flush.


I believe that NBD treats NBD_CMD_FLUSH the same as a REQ_PREFLUSH on an
empty bio.

If you don't read those two as compatible, I'd like to understand why not
(i.e. what additional constraints one is applying that the other is not)
as they are meant to be the same (save that NBD only has FLUSH as a command,
i.e. the 'empty bio' version). I am happy to improve the docs to make it
clearer.

(sidenote: I am interested in the change from REQ_FLUSH to REQ_PREFLUSH,
but in an empty bio it's not really relevant I think).

> FUA as defined by the Linux block layer (and supported that way in SCSI,
> ATA, NVMe) only requires the write operation the FUA bit is set on to be
> on non-volatile media before completing the write operation.  It does
> not impose any ordering, which seems to match the nbd spec.  Unlike the
> NBD spec Linux does not allow FUA to be set on anything by WRITE
> commands.  Some other storage protocols allow a FUA bit on READ
> commands or other commands that write data to the device, though.

I think you mean "anything *but* WRITE commands". In NBD setting
FUA on a command that does not write will do nothing, but FUA can
be set on NBD_CMD_TRIM and has the expected effect.
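
For a file-backed server, "the expected effect" amounts to persisting
that command's effects before the reply goes out - a sketch only, with
write_all()/trim_range() standing in for whatever helpers a real server
has:

#include <unistd.h>
#include <errno.h>
#include <sys/types.h>

int write_all(int fd, off_t off, size_t len);   /* assumed helpers */
int trim_range(int fd, off_t off, size_t len);

static int do_write_like(int export_fd, int is_trim,
                         off_t off, size_t len, int fua)
{
        int err = is_trim ? trim_range(export_fd, off, len)
                          : write_all(export_fd, off, len);
        if (err)
                return err;

        /* FUA: this command must be stable before we reply to it.
         * fdatasync() is broader than strictly required, which is fine. */
        if (fua && fdatasync(export_fd) < 0)
                return errno;

        return 0;
}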

Interestingly the kernel docs are silent on which commands REQ_FUA
can be set on.

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 12:11             ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 12:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team

Christoph,

> It's not a write barrier - a write barrier was command that ensured that
> 
> a) all previous writes were completed to the host/client
> b) all previous writes were on non-volatile storage
> 
> and
> 
> c) the actual write with the barrier bit was on non-volatile storage

Ah! The bit you are complaining about is not the bit I pointed you to, but:

> NBD_CMD_FLUSH (3)
> 
> A flush request; a write barrier. 

I can see that's potentially confusing as it isn't meant to mean 'an old-style
Linux kernel block device write barrier'. I think in general terms it
probably is some form of barrier, but I see no problem in deleting the
words "a write barrier" from the spec text if only to make it
clearer. However, I think the description of the command itself:

> The server MUST NOT send a successful reply header for this request before all write requests for which a reply has already been sent to the client have reached permanent storage (using fsync() or similar).

and the ordering section I pointed you to before, were both correct, yes?


>> The point still remains that "X was sent before Y" is difficult to
>> determine on the client side if X was sent over a different TCP channel
>> than Y, because a packet might be dropped (requiring a retransmit) for
>> X, and similar things. If blk-mq can deal with that, we're good and
>> nothing further needs to be done. If not, this should be evaluated by
>> someone more familiar with the internals of the kernel block subsystem
>> than me.
> 
> The important bit in all the existing protocols, and which Linux relies
> on, is that any write the Linux block layer got a completion for earlier
> needs to be flushed out to non-volatile storage when a FLUSH command is
> sent.  Anything that is still in flight does not matter.  Which for
> NBD means anything that you have already replied to needs to be flushed.

... that's what it says (I hope).

> Or to say it more practically - in the nbd server you simply need to
> call fdatasync on the backing device or file whenever you get a FLUSH
> request, and it will do the right thing.

actually fdatasync() technically does more than is necessary, as it
will also flush commands that have been processed, but for which no
reply has yet been sent - that's no bad thing.

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:11             ` Alex Bligh
@ 2016-09-15 12:18               ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:18 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team

On Thu, Sep 15, 2016 at 01:11:24PM +0100, Alex Bligh wrote:
> > NBD_CMD_FLUSH (3)
> > 
> > A flush request; a write barrier. 
> 
> I can see that's potentially confusing as it isn't meant to mean 'an old-style
> Linux kernel block device write barrier'. I think in general terms it
> probably is some form of barrier, but I see no problem in deleting the
> words "a write barrier" from the spec text if only to make it
> clearer.

Yes, please do that.  A "barrier" implies draining of the queue.

> However, I think the description of the command itself:
> 
> > The server MUST NOT send a successful reply header for this request before all write requests for which a reply has already been sent to the client have reached permanent storage (using fsync() or similar).
> 
> and the ordering section I pointed you to before, were both correct, yes?

Yes, this seems correct.

> actually fdatasync() technically does more than is necessary, as it
> will also flush commands that have been processed, but for which no
> reply has yet been sent - that's no bad thing.

Yes.  But without an actual barrier it's hard to be exact - and
fdatasync does the right thing by including false positives.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:01                 ` Wouter Verhelst
@ 2016-09-15 12:20                   ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:20 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Christoph Hellwig, Alex Bligh, nbd-general, linux-block,
	Josef Bacik, linux-kernel, Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 02:01:59PM +0200, Wouter Verhelst wrote:
> Yes. There was some discussion on that part, and we decided that setting
> the flag doesn't hurt, but the spec also clarifies that using it on READ
> does nothing, semantically.
>
> 
> The problem is that there are clients in the wild which do set it on
> READ, so it's just a matter of "be liberal in what you accept".

Note that FUA on READ in SCSI and NVMe does have a meaning - it
requires you to bypass any sort of cache on the target.  I think it's a
wrong definition because it mandates implementation details that aren't
observable by the initiator, but it's still the spec wording and nbd
diverges from it.  That's not necessarily a bad thing, but a caveat to
look out for.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:01         ` Christoph Hellwig
  2016-09-15 12:11             ` Alex Bligh
@ 2016-09-15 12:21           ` Wouter Verhelst
  2016-09-15 12:23             ` Christoph Hellwig
  1 sibling, 1 reply; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 12:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: nbd-general, linux-block, Josef Bacik, linux-kernel, mpa, kernel-team

On Thu, Sep 15, 2016 at 05:01:25AM -0700, Christoph Hellwig wrote:
> On Thu, Sep 15, 2016 at 01:55:14PM +0200, Wouter Verhelst wrote:
> > If that's not a write barrier, then I was using the wrong terminology
> > (and offer my apologies for the confusion).
> 
> It's not a write barrier - a write barrier was command that ensured that
[...]

Okay, I'll update the spec to remove that term then.

> > The point still remains that "X was sent before Y" is difficult to
> > determine on the client side if X was sent over a different TCP channel
> > than Y, because a packet might be dropped (requiring a retransmit) for
> > X, and similar things. If blk-mq can deal with that, we're good and
> > nothing further needs to be done. If not, this should be evaluated by
> > someone more familiar with the internals of the kernel block subsystem
> > than me.
> 
> The important bit in all the existing protocols, and which Linux relies
> on, is that any write the Linux block layer got a completion for earlier
> needs to be flushed out to non-volatile storage when a FLUSH command is
> sent.  Anything that is still in flight does not matter.  Which for
> NBD means anything that you have already replied to needs to be flushed.
> 
> Or to say it more practically - in the nbd server you simply need to
> call fdatasync on the backing device or file whenever you get a FLUSH
> request, and it will do the right thing.

Right. So do I understand you correctly that blk-mq currently doesn't
look at multiple queues, and just assumes that if a FLUSH is sent over
any one of the queues, it applies to all queues?

If so, I'll update the spec to clarify that servers should ensure this
holds in the case of multiple connections, or not allow writes, or not
allow multiple connections.

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:21           ` Wouter Verhelst
@ 2016-09-15 12:23             ` Christoph Hellwig
  2016-09-15 12:33                 ` Alex Bligh
  0 siblings, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:23 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Christoph Hellwig, nbd-general, linux-block, Josef Bacik,
	linux-kernel, mpa, kernel-team

On Thu, Sep 15, 2016 at 02:21:20PM +0200, Wouter Verhelst wrote:
> Right. So do I understand you correctly that blk-mq currently doesn't
> look at multiple queues, and just assumes that if a FLUSH is sent over
> any one of the queues, it applies to all queues?

Yes.  The same is true at the protocol level for NVMe or SCSI transports
that can make use of multiple queues.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:20                   ` Christoph Hellwig
@ 2016-09-15 12:26                     ` Wouter Verhelst
  -1 siblings, 0 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 12:26 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: nbd-general, linux-block, Josef Bacik, linux-kernel,
	Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 05:20:08AM -0700, Christoph Hellwig wrote:
> On Thu, Sep 15, 2016 at 02:01:59PM +0200, Wouter Verhelst wrote:
> > Yes. There was some discussion on that part, and we decided that setting
> > the flag doesn't hurt, but the spec also clarifies that using it on READ
> > does nothing, semantically.
> >
> > 
> > The problem is that there are clients in the wild which do set it on
> > READ, so it's just a matter of "be liberal in what you accept".
> 
> Note that FUA on READ in SCSI and NVMe does have a meaning - it
> requires you to bypass any sort of cache on the target.  I think it's a
> wrong definition because it mandates implementation details that aren't
> observable by the initiator, but it's still the spec wording and nbd
> diverges from it.  That's not necessarily a bad thing, but a caveat to
> look out for.

Yes. I think the kernel nbd driver should probably filter out FUA on
READ. It has no meaning in the case of nbd, and whatever expectations
the kernel may have cannot be provided for by nbd anyway.

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:26                     ` Wouter Verhelst
@ 2016-09-15 12:27                       ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:27 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Christoph Hellwig, nbd-general, linux-block, Josef Bacik,
	linux-kernel, Markus Pargmann, kernel-team

On Thu, Sep 15, 2016 at 02:26:31PM +0200, Wouter Verhelst wrote:
> Yes. I think the kernel nbd driver should probably filter out FUA on
> READ. It has no meaning in the case of nbd, and whatever expectations
> the kernel may have cannot be provided for by nbd anyway.

The kernel never sets FUA on reads - I just pointed out that other
protocols have defined (although horrible) semantics for it.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:18               ` Christoph Hellwig
@ 2016-09-15 12:28                 ` Alex Bligh
  -1 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 12:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, Markus Pargmann, kernel-team


> On 15 Sep 2016, at 13:18, Christoph Hellwig <hch@infradead.org> wrote:
> 
> Yes, please do that.  A "barrier" implies draining of the queue.

Done

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:23             ` Christoph Hellwig
@ 2016-09-15 12:33                 ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 12:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team


> On 15 Sep 2016, at 13:23, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 02:21:20PM +0200, Wouter Verhelst wrote:
>> Right. So do I understand you correctly that blk-mq currently doesn't
>> look at multiple queues, and just assumes that if a FLUSH is sent over
>> any one of the queues, it applies to all queues?
> 
> Yes.  The same is true at the protocol level for NVMe or SCSI transports
> that can make use of multiple queues.

At an implementation level that is going to be a little difficult
for some NBD servers, e.g. ones that fork() a different process per
connection. There is in general no IPC to speak of between server
instances. Such servers would thus be unsafe with more than one
connection if FLUSH is in use.

I believe such servers include the reference server, where there is a
process per connection (albeit possibly with several threads).

Even single process servers (including mine - gonbdserver) would
require logic to pair up multiple connections to the same
device.

I suspect ALL nbd servers would require a change. So I think we should
think carefully before introducing this protocol change.

It might be easier to make the client code change work around this
issue (somehow).

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:33                 ` Alex Bligh
@ 2016-09-15 12:36                   ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:36 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team

On Thu, Sep 15, 2016 at 01:33:20PM +0100, Alex Bligh wrote:
> At an implementation level that is going to be a little difficult
> for some NBD servers, e.g. ones that fork() a different process per
> connection. There is in general no IPC to speak of between server
> instances. Such servers would thus be unsafe with more than one
> connection if FLUSH is in use.
> 
> I believe such servers include the reference server where there is
> process per connection (albeit possibly with several threads).
> 
> Even single process servers (including mine - gonbdserver) would
> require logic to pair up multiple connections to the same
> device.

Why?  If you only send the completion after your I/O syscall returned,
you are fine if fsync comes from a different process, no matter
if you're using direct or buffered I/O underneath.
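
That property is easy to see from userspace: fsync()/fdatasync() act on
the file, not on the descriptor's own writes, so a completed write from
one process is covered by a flush issued from another.  A toy
illustration (the file name is invented):

#include <fcntl.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
        int fd = open("backing.img", O_RDWR);
        if (fd < 0)
                return 1;

        if (fork() == 0) {
                /* "other connection": write through its own descriptor */
                int wfd = open("backing.img", O_WRONLY);
                char buf[512] = { 0x42 };
                pwrite(wfd, buf, sizeof(buf), 0);  /* completion point */
                close(wfd);
                _exit(0);
        }
        wait(NULL);

        /* the connection that got the FLUSH */
        fdatasync(fd);  /* covers the child's completed write too */
        return 0;
}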

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:36                   ` Christoph Hellwig
@ 2016-09-15 12:39                     ` Alex Bligh
  -1 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 12:39 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team


> On 15 Sep 2016, at 13:36, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 01:33:20PM +0100, Alex Bligh wrote:
>> At an implementation level that is going to be a little difficult
>> for some NBD servers, e.g. ones that fork() a different process per
>> connection. There is in general no IPC to speak of between server
>> instances. Such servers would thus be unsafe with more than one
>> connection if FLUSH is in use.
>> 
>> I believe such servers include the reference server where there is
>> process per connection (albeit possibly with several threads).
>> 
>> Even single process servers (including mine - gonbdserver) would
>> require logic to pair up multiple connections to the same
>> device.
> 
> Why?  If you only send the completion after your I/O syscall returned,
> you are fine if fsync comes from a different process, no matter
> if you're using direct or buffered I/O underneath.

That's probably right in the case of file-based back ends that
are running on a Linux OS. But gonbdserver for instance supports
(e.g.) Ceph based backends, where each connection might be talking
to a completely separate ceph node, and there may be no cache
consistency between connections.

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:39                     ` Alex Bligh
@ 2016-09-15 12:41                       ` Christoph Hellwig
  -1 siblings, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2016-09-15 12:41 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team

On Thu, Sep 15, 2016 at 01:39:11PM +0100, Alex Bligh wrote:
> That's probably right in the case of file-based back ends that
> are running on a Linux OS. But gonbdserver for instance supports
> (e.g.) Ceph based backends, where each connection might be talking
> to a completely separate ceph node, and there may be no cache
> consistency between connections.

Yes, if you don't have a cache coherent backend you are generally
screwed with a multiqueue protocol.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:41                       ` Christoph Hellwig
@ 2016-09-15 12:44                         ` Alex Bligh
  -1 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 12:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alex Bligh, Wouter Verhelst, nbd-general, linux-block,
	Josef Bacik, linux-kernel, mpa, kernel-team


> On 15 Sep 2016, at 13:41, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Thu, Sep 15, 2016 at 01:39:11PM +0100, Alex Bligh wrote:
>> That's probably right in the case of file-based back ends that
>> are running on a Linux OS. But gonbdserver for instance supports
>> (e.g.) Ceph based backends, where each connection might be talking
>> to a completely separate ceph node, and there may be no cache
>> consistency between connections.
> 
> Yes, if you don't have a cache coherent backend you are generally
> screwed with a multiqueue protocol.

I wonder if the ability to support multiqueue should be visible
in the negotiation stage. That would allow the client to refuse
to select multiqueue where it isn't safe.

Wouter?
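
Something like a per-export transmission flag would do - purely a
hypothetical sketch, and the flag name below is invented for
illustration, not part of the current protocol:

#include <stdint.h>

/* Invented flag: the server would set it only if its backend stays
 * coherent across connections (shared cache, or no cache at all). */
#define NBD_FLAG_MULTI_CONN_SAFE  (1 << 8)

static int connections_to_open(uint16_t transmission_flags, int wanted)
{
        if (!(transmission_flags & NBD_FLAG_MULTI_CONN_SAFE))
                return 1;      /* not advertised: stick to one connection */
        return wanted;         /* safe to spread queues over several sockets */
}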

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 12:44                         ` Alex Bligh
@ 2016-09-15 13:17                           ` Wouter Verhelst
  -1 siblings, 0 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 13:17 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, nbd-general, Josef Bacik, linux-kernel,
	linux-block, mpa, kernel-team

On Thu, Sep 15, 2016 at 01:44:29PM +0100, Alex Bligh wrote:
> 
> > On 15 Sep 2016, at 13:41, Christoph Hellwig <hch@infradead.org> wrote:
> > 
> > On Thu, Sep 15, 2016 at 01:39:11PM +0100, Alex Bligh wrote:
> >> That's probably right in the case of file-based back ends that
> >> are running on a Linux OS. But gonbdserver for instance supports
> >> (e.g.) Ceph based backends, where each connection might be talking
> >> to a completely separate ceph node, and there may be no cache
> >> consistency between connections.
> > 
> > Yes, if you don't have a cache coherent backend you are generally
> > screwed with a multiqueue protocol.
> 
> I wonder if the ability to support multiqueue should be visible
> in the negotiation stage. That would allow the client to refuse
> to select multiqueue where it isn't safe.

The server can always refuse to allow multiple connections.

I was thinking of changing the spec as follows:

diff --git a/doc/proto.md b/doc/proto.md
index 217f57e..cb099e2 100644
--- a/doc/proto.md
+++ b/doc/proto.md
@@ -308,6 +308,23 @@ specification, the
 [kernel documentation](https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt)
 may be useful.
 
+For performance reasons, clients MAY open multiple connections to the
+same server. To support such clients, servers SHOULD ensure that at
+least one of the following conditions hold:
+
+* Flush commands are processed for ALL connections. That is, when an
+  `NBD_CMD_WRITE` is processed on one connection, and then an
+  `NBD_CMD_FLUSH` is processed on another connection, the data of the
+  `NBD_CMD_WRITE` on the first connection MUST reach permanent storage
+  before the reply of the `NBD_CMD_FLUSH` is sent.
+* The server allows `NBD_CMD_WRITE` and `NBD_CMD_FLUSH` on at most one
+  connection
+* Multiple connections are not allowed
+
+In addition, clients using multiple connections SHOULD NOT send
+`NBD_CMD_FLUSH` if an `NBD_CMD_WRITE` for which they care in relation to
+the flush has not been replied to yet.
+
 #### Request message
 
 The request message, sent by the client, looks as follows:

The latter bit (on the client side) is because even if your backend has
no cache coherency issues, TCP does not guarantee ordering between
multiple connections. I don't know if the above is in line with what
blk-mq does, but consider the following scenario:

- A client sends two writes to the server, followed (immediately) by a
  flush, where at least the second write and the flush are not sent over
  the same connection.
- The first write is a small one, and it is handled almost immediately.
- The second write takes a little longer, so the flush is handled
  earlier than the second write.
- The network packet containing the flush reply gets lost for whatever
  reason, so the client doesn't get it, and we fall into TCP
  retransmits.
- The second write finishes, and its reply header does not get lost.
- After the second write reply reaches the client, the TCP retransmits
  for the flush reply are handled.

In the above scenario, the flush reply arrives on the client side after
a write reply which it did not cover; so the client will (incorrectly)
assume that the write has reached permanent storage when in fact it may
not have done so yet.

If the kernel does not care about the ordering of the two writes versus
the flush, then there is no problem. I don't know how blk-mq works in
that context, but if the above is a likely scenario, we may have to
reconsider adding blk-mq to nbd.
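
For what it's worth, the client-side half of the draft above ("don't
send a flush until the writes you want it to cover have been replied
to") is cheap to implement - an illustrative sketch only, with invented
names and a single global counter:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv   = PTHREAD_COND_INITIALIZER;
static unsigned long writes_awaiting_reply;  /* ++ when a write is sent */

/* Called by whichever connection's reader sees a write reply. */
void note_write_reply(void)
{
        pthread_mutex_lock(&lock);
        if (--writes_awaiting_reply == 0)
                pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&lock);
}

/* Delay NBD_CMD_FLUSH until the writes it should cover are replied to. */
void send_flush_when_covered(void (*send_flush)(void))
{
        pthread_mutex_lock(&lock);
        while (writes_awaiting_reply)
                pthread_cond_wait(&cv, &lock);
        pthread_mutex_unlock(&lock);
        send_flush();
}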

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply related	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 11:09       ` Alex Bligh
@ 2016-09-15 13:34         ` Eric Blake
  -1 siblings, 0 replies; 95+ messages in thread
From: Eric Blake @ 2016-09-15 13:34 UTC (permalink / raw)
  To: Alex Bligh, Wouter Verhelst, Josef Bacik
  Cc: linux-block, Markus Pargmann, kernel-team, nbd-general,
	linux-kernel, Paolo Bonzini


[-- Attachment #1.1: Type: text/plain, Size: 1985 bytes --]

On 09/15/2016 06:09 AM, Alex Bligh wrote:
> 
> I also wonder whether any servers that can do caching per
> connection will always share a consistent cache between 
> connections. The one I'm worried about in particular here
> is qemu-nbd - Eric Blake CC'd.
> 

I doubt that qemu-nbd would ever want to support the situation with more
than one client connection writing to the same image at the same time;
the implications of sorting out data consistency between multiple
writers are rather complex and not worth coding into qemu.  So I think
qemu would probably prefer to just prohibit the multiple writer
situation.  And while multiple readers with no writer should be fine,
I'm not even sure if multiple readers plus one writer can always be made
to appear sane (if there is no coordination between the different
connections, on an image where the writer changes AA to BA then flushes
then changes to BB, it is still feasible that a reader could see AB
(pre-flush state of the first sector, post-flush changes to the second
sector), even though the writer never flushed that particular content to
disk).

But Paolo Bonzini (cc'd) may have more insight on qemu's NBD server and
what it supports (or forbids) in the way of multiple clients to a single
server.

> A more general point is that with multiple queues requests
> may be processed in a different order even by those servers that
> currently process the requests in strict order, or in something
> similar to strict order. The server is permitted by the spec
> (save as mandated by NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA) to
> process commands out of order anyway, but I suspect this has
> to date been little tested.

qemu-nbd is definitely capable of serving reads and writes out-of-order
to a single connection client; but that's different than the case with
multiple connections.

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 604 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 13:17                           ` Wouter Verhelst
@ 2016-09-15 13:57                             ` Josef Bacik
  -1 siblings, 0 replies; 95+ messages in thread
From: Josef Bacik @ 2016-09-15 13:57 UTC (permalink / raw)
  To: Wouter Verhelst, Alex Bligh
  Cc: Christoph Hellwig, nbd-general, linux-kernel, linux-block, mpa,
	kernel-team

On 09/15/2016 09:17 AM, Wouter Verhelst wrote:
> On Thu, Sep 15, 2016 at 01:44:29PM +0100, Alex Bligh wrote:
>>
>>> On 15 Sep 2016, at 13:41, Christoph Hellwig <hch@infradead.org> wrote:
>>>
>>> On Thu, Sep 15, 2016 at 01:39:11PM +0100, Alex Bligh wrote:
>>>> That's probably right in the case of file-based back ends that
>>>> are running on a Linux OS. But gonbdserver for instance supports
>>>> (e.g.) Ceph based backends, where each connection might be talking
>>>> to a completely separate ceph node, and there may be no cache
>>>> consistency between connections.
>>>
>>> Yes, if you don't have a cache coherent backend you are generally
>>> screwed with a multiqueue protocol.
>>
>> I wonder if the ability to support multiqueue should be visible
>> in the negotiation stage. That would allow the client to refuse
>> to select multiqueue where it isn't safe.
>
> The server can always refuse to allow multiple connections.
>
> I was thinking of changing the spec as follows:
>
> diff --git a/doc/proto.md b/doc/proto.md
> index 217f57e..cb099e2 100644
> --- a/doc/proto.md
> +++ b/doc/proto.md
> @@ -308,6 +308,23 @@ specification, the
>  [kernel documentation](https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt)
>  may be useful.
>
> +For performance reasons, clients MAY open multiple connections to the
> +same server. To support such clients, servers SHOULD ensure that at
> +least one of the following conditions hold:
> +
> +* Flush commands are processed for ALL connections. That is, when an
> +  `NBD_CMD_WRITE` is processed on one connection, and then an
> +  `NBD_CMD_FLUSH` is processed on another connection, the data of the
> +  `NBD_CMD_WRITE` on the first connection MUST reach permanent storage
> +  before the reply of the `NBD_CMD_FLUSH` is sent.
> +* The server allows `NBD_CMD_WRITE` and `NBD_CMD_FLUSH` on at most one
> +  connection
> +* Multiple connections are not allowed
> +
> +In addition, clients using multiple connections SHOULD NOT send
> +`NBD_CMD_FLUSH` if an `NBD_CMD_WRITE` for which they care in relation to
> +the flush has not been replied to yet.
> +
>  #### Request message
>
>  The request message, sent by the client, looks as follows:
>
> The latter bit (on the client side) is because even if your backend has
> no cache coherency issues, TCP does not guarantee ordering between
> multiple connections. I don't know if the above is in line with what
> blk-mq does, but consider the following scenario:
>
> - A client sends two writes to the server, followed (immediately) by a
>   flush, where at least the second write and the flush are not sent over
>   the same connection.
> - The first write is a small one, and it is handled almost immediately.
> - The second write takes a little longer, so the flush is handled
>   earlier than the second write
> - The network packet containing the flush reply gets lost for whatever
>   reason, so the client doesn't get it, and we fall into TCP
>   retransmits.
> - The second write finishes, and its reply header does not get lost
> - After the second write reply reaches the client, the TCP retransmits
>   for the flush reply are handled.
>
> In the above scenario, the flush reply arrives on the client side after
> a write reply which it did not cover; so the client will (incorrectly)
> assume that the write has reached permanent storage when in fact it may
> not have done so yet.

This isn't an NBD problem, this is an application problem.  The application must 
wait for all writes it cares about _before_ issuing a flush.  This is the same 
for normal storage as it is for NBD.  It is not NBD's responsibility to 
maintain coherency between multiple requests across connections, but simply to 
act on and respond to requests.
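
As a purely illustrative userspace analogue of that ordering (here
fdatasync() plays the role of NBD_CMD_FLUSH; with asynchronous
submission, e.g. libaio or io_uring, the completion would have to be
reaped before the sync is issued):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char buf[] = "payload";
        int fd = open("datafile", O_WRONLY | O_CREAT, 0644);

        if (fd < 0)
                return 1;

        /* The write must have completed ("been replied to") ... */
        if (pwrite(fd, buf, strlen(buf), 0) != (ssize_t)strlen(buf))
                return 1;

        /* ... before the flush is issued; otherwise the flush makes no
         * promise about it. */
        if (fdatasync(fd) != 0)
                return 1;

        close(fd);
        puts("write persisted before the flush returned");
        return 0;
}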

I think changing the specification to indicate that this is the case for 
multiple connections is a good thing, to keep NBD servers from doing weird 
things like sending different connections to the same export to different 
backing stores without some sort of synchronization.  It should definitely be 
explicitly stated somewhere that NBD does not provide any ordering guarantees 
and that is up to the application.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 13:34         ` Eric Blake
@ 2016-09-15 14:07           ` Paolo Bonzini
  -1 siblings, 0 replies; 95+ messages in thread
From: Paolo Bonzini @ 2016-09-15 14:07 UTC (permalink / raw)
  To: Eric Blake, Alex Bligh, Wouter Verhelst, Josef Bacik
  Cc: linux-block, Markus Pargmann, kernel-team, nbd-general, linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 2356 bytes --]



On 15/09/2016 15:34, Eric Blake wrote:
> On 09/15/2016 06:09 AM, Alex Bligh wrote:
>>
>> I also wonder whether any servers that can do caching per
>> connection will always share a consistent cache between 
>> connections. The one I'm worried about in particular here
>> is qemu-nbd - Eric Blake CC'd.
>>
> 
> I doubt that qemu-nbd would ever want to support the situation with more
> than one client connection writing to the same image at the same time;
> the implications of sorting out data consistency between multiple
> writers is rather complex and not worth coding into qemu.  So I think
> qemu would probably prefer to just prohibit the multiple writer
> situation.  And while multiple readers with no writer should be fine,
> I'm not even sure if multiple readers plus one writer can always be made
> to appear sane (if there is no coordination between the different
> connections, on an image where the writer changes AA to BA then flushes
> then changes to BB, it is still feasible that a reader could see AB
> (pre-flush state of the first sector, post-flush changes to the second
> sector, even though the writer never flushed that particular content to
> disk).
> 
> But Paolo Bonzini (cc'd) may have more insight on qemu's NBD server and
> what it supports (or forbids) in the way of multiple clients to a single
> server.

I don't think QEMU forbids multiple clients connecting to a single server,
and it guarantees consistency as long as there is no overlap between writes
and reads.  These are the same guarantees you have for multiple commands on
a single connection.

In other words, from the POV of QEMU there's no difference whether
multiple commands come from one or more connections.

Paolo

>> A more general point is that with multiple queues requests
>> may be processed in a different order even by those servers that
>> currently process the requests in strict order, or in something
>> similar to strict order. The server is permitted by the spec
>> (save as mandated by NBD_CMD_FLUSH and NBD_CMD_FLAG_FUA) to
>> process commands out of order anyway, but I suspect this has
>> to date been little tested.
> 
> qemu-nbd is definitely capable of serving reads and writes out-of-order
> to a single connection client; but that's different than the case with
> multiple connections.
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 15:17                               ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 15:17 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Alex Bligh, Wouter Verhelst, Christoph Hellwig, nbd-general,
	linux-kernel, linux-block, Markus Pargmann, kernel-team

Josef,

> On 15 Sep 2016, at 14:57, Josef Bacik <jbacik@fb.com> wrote:
> 
> This isn't an NBD problem, this is an application problem.  The application must wait for all writes it cares about _before_ issuing a flush.  This is the same as for normal storage as it is for NBD.  It is not NBD's responsibility to maintain coherency between multiple requests across connections, just simply to act on and respond to requests.
> 
> I think changing the specification to indicate that this is the case for multiple connections is a good thing, to keep NBD servers from doing weird things like sending different connections to the same export to different backing stores without some sort of synchronization.  It should definitely be explicitly stated somewhere that NBD does not provide any ordering guarantees and that is up to the application.  Thanks,

I don't think that's correct.

The block stack issues a flush to mean (precisely) "do not reply to this until all preceding writes THAT HAVE BEEN REPLIED TO have been persisted to non-volatile storage".

The danger is with multiple connections (where apparently only one flush is sent - let's say down connection 1) that not all the writes that have been replied to on connection 2 have been persisted to non-volatile storage. Only the ones on connection 1 have been persisted (this is assuming the nbd server doesn't 'link' the connections in some way).

There's nothing the 'application' (here meaning the kernel or higher level) can do to mitigate this. Sure it can wait for all the replies, but this doesn't guarantee the writes have been persisted to non-volatile storage, precisely because writes may return prior to this.
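
Expressed as a sketch (types and names invented for the example), that
obligation looks like the following; a server that only persisted the
replied-to writes of the connection the flush arrived on would violate
it:

#include <stdio.h>

enum wstate { IN_FLIGHT, REPLIED };

struct write_req {
        int id;
        enum wstate state;
        int persisted;
};

/* On NBD_CMD_FLUSH: every write already replied to, on any connection,
 * must reach stable storage before the flush reply is sent. */
static void handle_flush(struct write_req *w, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (w[i].state == REPLIED)
                        w[i].persisted = 1;
        /* only now may the flush reply be sent */
}

int main(void)
{
        struct write_req w[] = {
                { 1, REPLIED,   0 },    /* replied on connection 1 */
                { 2, REPLIED,   0 },    /* replied on connection 2 */
                { 3, IN_FLIGHT, 0 },    /* not yet replied to */
        };
        int i;

        handle_flush(w, 3);
        for (i = 0; i < 3; i++)
                printf("write %d persisted: %d\n", w[i].id, w[i].persisted);
        return 0;
}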

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 14:07           ` Paolo Bonzini
@ 2016-09-15 15:23             ` Alex Bligh
  -1 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 15:23 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Alex Bligh, Eric Blake, Wouter Verhelst, Josef Bacik,
	linux-block, Markus Pargmann, kernel-team, nbd-general,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1867 bytes --]

Paolo,

> On 15 Sep 2016, at 15:07, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> I don't think QEMU forbids multiple clients to the single server, and
> guarantees consistency as long as there is no overlap between writes and
> reads.  These are the same guarantees you have for multiple commands on
> a single connection.
> 
> In other words, from the POV of QEMU there's no difference whether
> multiple commands come from one or more connections.

This isn't really about ordering, it's about cache coherency
and persisting things to disk.

What you say is correct as far as it goes in terms of ordering. However,
consider the following scenario, with reads and writes of the same block
on two channels:

     Channel1     Channel2

     R                      Block read, and cached in user space in
                            channel 1's cache
                            Reply sent

                  W         New value written, channel 2's cache updated
                            channel 1's cache not

     R                      Value returned from channel 1's cache.


In the above scenario, there is a problem if the server(s) handling the
two channels each use a read cache which is not coherent between the
two channels. An example would be a read-through cache on a server that
did fork() and shared no state between connections.
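
A minimal sketch of that fork()-based failure mode (illustrative only,
no real server code): after fork() each child has its own copy of the
user-space cache, so a write handled on one connection never
invalidates what the other connection has cached.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static char block_cache[16] = "AA";     /* becomes per-process after fork() */

int main(void)
{
        pid_t pid = fork();

        if (pid < 0)
                return 1;

        if (pid == 0) {
                /* "connection 2": writes a new value, updates its own cache */
                strcpy(block_cache, "BA");
                printf("connection 2 cache: %s\n", block_cache);
                _exit(0);
        }

        wait(NULL);
        /* "connection 1": still serves the stale value from its own copy */
        printf("connection 1 cache: %s (stale)\n", block_cache);
        return 0;
}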

Similarly, if there is a write on channel 1 that has completed, and
the flush goes to channel 2, the flush may not (if state is not shared)
guarantee that the completed write on channel 1 is persisted to
non-volatile media. Obviously if the 'state' is the OS block
cache/buffers/whatever, it will; but if it's (e.g.) a user-space
per-process write-through cache, it won't.

I don't know whether qemu-nbd is likely to suffer from either of these.

--
Alex Bligh





[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 842 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 13:34         ` Eric Blake
@ 2016-09-15 15:25           ` Alex Bligh
  -1 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 15:25 UTC (permalink / raw)
  To: Eric Blake
  Cc: Alex Bligh, Wouter Verhelst, Josef Bacik, linux-block,
	Markus Pargmann, kernel-team, nbd-general, linux-kernel,
	Paolo Bonzini

[-- Attachment #1: Type: text/plain, Size: 966 bytes --]

Eric,

> I doubt that qemu-nbd would ever want to support the situation with more
> than one client connection writing to the same image at the same time;
> the implications of sorting out data consistency between multiple
> writers is rather complex and not worth coding into qemu.  So I think
> qemu would probably prefer to just prohibit the multiple writer
> situation.

Yeah, I was thinking about a 'no multiple connection' flag.

>  And while multiple readers with no writer should be fine,
> I'm not even sure if multiple readers plus one writer can always be made
> to appear sane (if there is no coordination between the different
> connections, on an image where the writer changes AA to BA then flushes
> then changes to BB, it is still feasible that a reader could see AB
> (pre-flush state of the first sector, post-flush changes to the second
> sector, even though the writer never flushed that particular content to
> disk).

Agree

--
Alex Bligh





[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 842 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
@ 2016-09-15 16:08                             ` Alex Bligh
  0 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 16:08 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, Christoph Hellwig, nbd-general, Josef Bacik,
	linux-kernel, linux-block, mpa, kernel-team

Wouter,

> The server can always refuse to allow multiple connections.

Sure, but it would be neater to warn the client of that
at negotiation stage (it would only be one flag, e.g.
'multiple connections unsafe'). That way the connection
won't fail with a cryptic EBUSY or whatever, but will
just negotiate a single connection.
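
For illustration, such a flag might look something like the sketch
below; the flag name and bit position are invented here and are not
part of the current protocol (only NBD_FLAG_HAS_FLAGS, bit 0, is an
existing transmission flag):

#include <stdint.h>
#include <stdio.h>

#define NBD_FLAG_HAS_FLAGS              (1 << 0)        /* existing */
#define NBD_FLAG_MULTI_CONN_UNSAFE      (1 << 9)        /* hypothetical */

/* Client-side decision: how many connections to open to this export. */
static unsigned int max_connections(uint16_t transmission_flags,
                                    unsigned int requested)
{
        if (transmission_flags & NBD_FLAG_MULTI_CONN_UNSAFE)
                return 1;       /* fall back rather than fail with EBUSY */
        return requested;
}

int main(void)
{
        uint16_t flags = NBD_FLAG_HAS_FLAGS | NBD_FLAG_MULTI_CONN_UNSAFE;

        printf("connections to use: %u\n", max_connections(flags, 4));
        return 0;
}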

> I was thinking of changing the spec as follows:
> 
> diff --git a/doc/proto.md b/doc/proto.md
> index 217f57e..cb099e2 100644
> --- a/doc/proto.md
> +++ b/doc/proto.md
> @@ -308,6 +308,23 @@ specification, the
> [kernel documentation](https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt)
> may be useful.
> 
> +For performance reasons, clients MAY open multiple connections to the
> +same server. To support such clients, servers SHOULD ensure that at
> +least one of the following conditions hold:
> +
> +* Flush commands are processed for ALL connections. That is, when an
> +  `NBD_CMD_WRITE` is processed on one connection, and then an
> +  `NBD_CMD_FLUSH` is processed on another connection, the data of the
> +  `NBD_CMD_WRITE` on the first connection MUST reach permanent storage
> +  before the reply of the `NBD_CMD_FLUSH` is sent.
> +* The server allows `NBD_CMD_WRITE` and `NBD_CMD_FLUSH` on at most one
> +  connection
> +* Multiple connections are not allowed
> +
> +In addition, clients using multiple connections SHOULD NOT send
> +`NBD_CMD_FLUSH` if an `NBD_CMD_WRITE` for which they care in relation to
> +the flush has not been replied to yet.
> +

I don't think that should be a mandatory behaviour. For once, it would
be reasonably easy on gonbdserver but a PITA on the reference server.
You'd need to put in IPC between each of the forked processes OR rely
on fdatasync() only - I'm not sure that would necessarily work
safely with (e.g.) the 'treefiles' / COW options.

I think better would be to say that the server MUST either

* Not support NBD_CMD_FLUSH at all
* Support NBD_CMD_FLUSH across channels (as you set out above), or
* Indicate that it does not support multiple channels.

> #### Request message
> 
> The request message, sent by the client, looks as follows:
> 
> The latter bit (on the client side) is because even if your backend has
> no cache coherency issues, TCP does not guarantee ordering between
> multiple connections. I don't know if the above is in line with what
> blk-mq does, but consider the following scenario:
> 
> - A client sends two writes to the server, followed (immediately) by a
>  flush, where at least the second write and the flush are not sent over
>  the same connection.
> - The first write is a small one, and it is handled almost immediately.
> - The second write takes a little longer, so the flush is handled
>  earlier than the second write
> - The network packet containing the flush reply gets lost for whatever
>  reason, so the client doesn't get it, and we fall into TCP
>  retransmits.
> - The second write finishes, and its reply header does not get lost
> - After the second write reply reaches the client, the TCP retransmits
>  for the flush reply are handled.
> 
> In the above scenario, the flush reply arrives on the client side after
> a write reply which it did not cover; so the client will (incorrectly)
> assume that the write has reached permanent storage when in fact it may
> not have done so yet.
> 
> If the kernel does not care about the ordering of the two writes versus
> the flush, then there is no problem. I don't know how blk-mq works in
> that context, but if the above is a likely scenario, we may have to
> reconsider adding blk-mq to nbd.

Actually I think this is a problem anyway. A simpler failure case is
one where (by chance) one channel gets the writes, and one channel
gets the flushes. The flush reply is delayed beyond the replies to
unconnected writes (on the other channel) and hence the kernel thinks
replied-to writes have been persisted when they have not been.

The only way to fix that (as far as I can see) without changing flush
semantics is for the block layer to issue flush requests on each
channel when flushing on one channel. This, incidentally, would
resolve every other issue!
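
A sketch of what that fan-out could look like from the client's point
of view (purely illustrative, not kernel code; a real driver would
issue the per-connection flushes asynchronously and complete the
original flush only once all of them have been acked):

#include <stdio.h>

#define NR_CONNS 4

/* Stand-in for sending NBD_CMD_FLUSH on one socket and waiting for its
 * reply. */
static int flush_one_connection(int conn)
{
        printf("NBD_CMD_FLUSH acked on connection %d\n", conn);
        return 0;
}

static int flush_device(void)
{
        int i, err = 0;

        for (i = 0; i < NR_CONNS; i++)
                if (flush_one_connection(i))
                        err = -1;       /* any failure fails the device flush */
        return err;
}

int main(void)
{
        return flush_device() ? 1 : 0;
}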

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 16:08                             ` Alex Bligh
@ 2016-09-15 16:27                               ` Wouter Verhelst
  -1 siblings, 0 replies; 95+ messages in thread
From: Wouter Verhelst @ 2016-09-15 16:27 UTC (permalink / raw)
  To: Alex Bligh
  Cc: nbd-general, linux-block, Josef Bacik, linux-kernel,
	Christoph Hellwig, mpa, kernel-team

On Thu, Sep 15, 2016 at 05:08:21PM +0100, Alex Bligh wrote:
> Wouter,
> 
> > The server can always refuse to allow multiple connections.
> 
> Sure, but it would be neater to warn the client of that at negotiation
> stage (it would only be one flag, e.g.  'multiple connections
> unsafe').

I suppose that's not a bad idea.

[...]
> > I was thinking of changing the spec as follows:
> > 
> > diff --git a/doc/proto.md b/doc/proto.md
> > index 217f57e..cb099e2 100644
> > --- a/doc/proto.md
> > +++ b/doc/proto.md
> > @@ -308,6 +308,23 @@ specification, the
> > [kernel documentation](https://www.kernel.org/doc/Documentation/block/writeback_cache_control.txt)
> > may be useful.
> > 
> > +For performance reasons, clients MAY open multiple connections to the
> > +same server. To support such clients, servers SHOULD ensure that at
> > +least one of the following conditions hold:
> > +
> > +* Flush commands are processed for ALL connections. That is, when an
> > +  `NBD_CMD_WRITE` is processed on one connection, and then an
> > +  `NBD_CMD_FLUSH` is processed on another connection, the data of the
> > +  `NBD_CMD_WRITE` on the first connection MUST reach permanent storage
> > +  before the reply of the `NBD_CMD_FLUSH` is sent.
> > +* The server allows `NBD_CMD_WRITE` and `NBD_CMD_FLUSH` on at most one
> > +  connection
> > +* Multiple connections are not allowed
> > +
> > +In addition, clients using multiple connections SHOULD NOT send
> > +`NBD_CMD_FLUSH` if an `NBD_CMD_WRITE` for which they care in relation to
> > +the flush has not been replied to yet.
> > +
> 
> I don't think that should be a mandatory behaviour.

Which part of it?

> For once, it would
> be reasonably easy on gonbdserver but a PITA on the reference server.
> You'd need to put in IPC between each of the forked processes OR rely
> on fdatasync() only - I'm not sure that would necessarily work
> safely with (e.g.) the 'treefiles' / COW options.
> 
> I think better would be to say that the server MUST either
> 
> * Not support NBD_CMD_FLUSH at all

I think we should discourage not supporting FLUSH, rather than
suggesting it. 

> * Support NBD_CMD_FLUSH across channels (as you set out above), or
> * Indicate that it does not support multiple channels.

You dropped the one with no writes. I said "at most" there for a reason.
Originally I was going to say "if the server is read-only", but then
thought that it could work to do the "at most" thing. After having given
that some more thought, I now realize that if you write, you need to
flush across to other channels, regardless of whether they write too, so
that bit of it is moot now anyway.

Still, a server which exports read-only should still be safe for
multiple connections, even if there is no cache coherency (since
presumably nothing's going to change anyway).

[...]
> > The latter bit (on the client side) is because even if your backend has
> > no cache coherency issues, TCP does not guarantee ordering between
> > multiple connections. I don't know if the above is in line with what
> > blk-mq does, but consider the following scenario:
> > 
> > - A client sends two writes to the server, followed (immediately) by a
> >  flush, where at least the second write and the flush are not sent over
> >  the same connection.
> > - The first write is a small one, and it is handled almost immediately.
> > - The second write takes a little longer, so the flush is handled
> >  earlier than the second write
> > - The network packet containing the flush reply gets lost for whatever
> >  reason, so the client doesn't get it, and we fall into TCP
> >  retransmits.
> > - The second write finishes, and its reply header does not get lost
> > - After the second write reply reaches the client, the TCP retransmits
> >  for the flush reply are handled.
> > 
> > In the above scenario, the flush reply arrives on the client side after
> > a write reply which it did not cover; so the client will (incorrectly)
> > assume that the write has reached permanent storage when in fact it may
> > not have done so yet.
> > 
> > If the kernel does not care about the ordering of the two writes versus
> > the flush, then there is no problem. I don't know how blk-mq works in
> > that context, but if the above is a likely scenario, we may have to
> > reconsider adding blk-mq to nbd.
> 
> Actually I think this is a problem anyway. A simpler failure case is
> one where (by chance) one channel gets the writes, and one channel
> gets the flushes. The flush reply is delayed beyond the replies to
> unconnected writes (on the other channel) and hence the kernel thinks
> replied-to writes have been persisted when they have not been.

Yes, that is another example of essentially the same problem.

> The only way to fix that (as far as I can see) without changing flush
> semantics is for the block layer to issue flush requests on each
> channel when flushing on one channel.

Christoph just said that that doesn't (currently) happen; I don't know
whether the kernel currently already (optimistically) sends out flush
requests before the writes that it expects to hit permanent storage have
finished, but if it doesn't do that, then there is no problem and my
suggested bit of spec would be okay.

If there are good reasons to do so, however, we do indeed have a problem
and something else is necessary. I don't think flushing across all
connections is the best solution, though.

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 16:27                               ` Wouter Verhelst
@ 2016-09-15 16:42                                 ` Alex Bligh
  -1 siblings, 0 replies; 95+ messages in thread
From: Alex Bligh @ 2016-09-15 16:42 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, nbd-general, linux-block, Josef Bacik, linux-kernel,
	Christoph Hellwig, Markus Pargmann, kernel-team

Wouter,

> On 15 Sep 2016, at 17:27, Wouter Verhelst <w@uter.be> wrote:
>=20
> On Thu, Sep 15, 2016 at 05:08:21PM +0100, Alex Bligh wrote:
>> Wouter,
>>=20
>>> The server can always refuse to allow multiple connections.
>>=20
>> Sure, but it would be neater to warn the client of that at =
negotiation
>> stage (it would only be one flag, e.g.  'multiple connections
>> unsafe').
>=20
> I suppose that's not a bad idea.

Good.

> [...]
>>> I was thinking of changing the spec as follows:
>>>=20
>>> diff --git a/doc/proto.md b/doc/proto.md
>>> index 217f57e..cb099e2 100644
>>> --- a/doc/proto.md
>>> +++ b/doc/proto.md
>>> @@ -308,6 +308,23 @@ specification, the
>>> [kernel =
documentation](https://www.kernel.org/doc/Documentation/block/writeback_ca=
che_control.txt)
>>> may be useful.
>>>=20
>>> +For performance reasons, clients MAY open multiple connections to =
the
>>> +same server. To support such clients, servers SHOULD ensure that at
>>> +least one of the following conditions hold:
>>> +
>>> +* Flush commands are processed for ALL connections. That is, when =
an
>>> +  `NBD_CMD_WRITE` is processed on one connection, and then an
>>> +  `NBD_CMD_FLUSH` is processed on another connection, the data of =
the
>>> +  `NBD_CMD_WRITE` on the first connection MUST reach permanent =
storage
>>> +  before the reply of the `NBD_CMD_FLUSH` is sent.
>>> +* The server allows `NBD_CMD_WRITE` and `NBD_CMD_FLUSH` on at most =
one
>>> +  connection
>>> +* Multiple connections are not allowed
>>> +
>>> +In addition, clients using multiple connections SHOULD NOT send
>>> +`NBD_CMD_FLUSH` if an `NBD_CMD_WRITE` for which they care in =
relation to
>>> +the flush has not been replied to yet.
>>> +
>>=20
>> I don't think that should be a mandatory behaviour.
>=20
> Which part of it?

I've read it again :-) The wording was slightly contorted. I think
what I mean is that if you don't support flush at all, that's
another option.

The final paragraph I am not sure is right, as that's not what the =
kernel
currently does. If we are going to suggest a change in our main client's
behaviour, should we not just request that flush is done on all =
channels?

>> For once, it would
>> be reasonably easy on gonbdserver but a PITA on the reference server.
>> You'd need to put in IPC between each of the forked processes OR rely
>> on fdatasync() only - I'm not sure that would necessarily work
>> safely with (e.g.) the 'treefiles' / COW options.
>>
>> I think better would be to say that the server MUST either
>>
>> * Not support NBD_CMD_FLUSH at all
>
> I think we should discourage not supporting FLUSH, rather than
> suggesting it.

Sure, but some backends just don't support flush. For them, this
aspect at least is not a problem.

>> * Support NBD_CMD_FLUSH across channels (as you set out above), or
>> * Indicate that it does not support multiple channels.
>
> You dropped the one with no writes. I said "at most" there for a reason.
> Originally I was going to say "if the server is read-only", but then
> thought that it could work to do the "at most" thing. After having given
> that some more thought, I now realize that if you write, you need to
> flush across to other channels, regardless of whether they write too, so
> that bit of it is moot now anyway.
>
> Still, a server which exports read-only should still be safe for
> multiple connections, even if there is no cache coherency (since
> presumably nothing's going to change anyway).

Yes

>> Actually I think this is a problem anyway. A simpler failure case is
>> one where (by chance) one channel gets the writes, and one channel
>> gets the flushes. The flush reply is delayed beyond the replies to
>> unconnected writes (on the other channel) and hence the kernel thinks
>> replied-to writes have been persisted when they have not been.
>
> Yes, that is another example of essentially the same problem.

Yeah, I was just trying to simplify things.

>> The only way to fix that (as far as I can see) without changing flush
>> semantics is for the block layer to issue flush requests on each
>> channel when flushing on one channel.
>
> Christoph just said that that doesn't (currently) happen; I don't know
> whether the kernel currently already (optimistically) sends out flush
> requests before the writes that it expects to hit permanent storage have
> finished, but if it doesn't do that, then there is no problem and my
> suggested bit of spec would be okay.
>
> If there are good reasons to do so, however, we do indeed have a problem
> and something else is necessary. I don't think flushing across all
> connections is the best solution, though.

Well, the way I look at it is that we have a proposed change in
client behaviour (multiple channels) which causes problems at
least with flush and also (I think) with cache coherency (see other
email). We should either not make that change, or ensure other changes
are added which mitigate these issues.

Flush is actually the obvious one. Cache coherency is far more
subtle (though possibly fixable by something in the spec that
states that if multiple connections are supported, cache must
be coherent between them).

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 16:27                               ` Wouter Verhelst
@ 2016-09-15 19:02                                 ` Eric Blake
  -1 siblings, 0 replies; 95+ messages in thread
From: Eric Blake @ 2016-09-15 19:02 UTC (permalink / raw)
  To: Wouter Verhelst, Alex Bligh
  Cc: nbd-general, Christoph Hellwig, Josef Bacik, linux-kernel,
	linux-block, mpa, kernel-team


On 09/15/2016 11:27 AM, Wouter Verhelst wrote:
> On Thu, Sep 15, 2016 at 05:08:21PM +0100, Alex Bligh wrote:
>> Wouter,
>>
>>> The server can always refuse to allow multiple connections.
>>
>> Sure, but it would be neater to warn the client of that at negotiation
>> stage (it would only be one flag, e.g.  'multiple connections
>> unsafe').
> 
> I suppose that's not a bad idea.

Seems like it may need to be a per-export flag, rather than a global
flag (as a given server may be able to serve multiple types of files,
where the safety depends on the type of file being served).
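
Concretely, since the 16-bit transmission flags are already sent per export
during negotiation, such a bit could simply live there. In the sketch below
only the first three defines are existing protocol flags;
NBD_FLAG_MULTI_CONN_SAFE is an invented placeholder, not something in the
spec today:

#include <stdbool.h>
#include <stdint.h>

#define NBD_FLAG_HAS_FLAGS        (1 << 0)   /* existing flag */
#define NBD_FLAG_READ_ONLY        (1 << 1)   /* existing flag */
#define NBD_FLAG_SEND_FLUSH       (1 << 2)   /* existing flag */
#define NBD_FLAG_MULTI_CONN_SAFE  (1 << 15)  /* placeholder, NOT in the spec */

struct export_cfg {
        bool readonly;
        bool backend_shares_state;   /* e.g. one process, one shared cache */
};

/* Computed per export, so one server can expose a raw image where multiple
 * connections are fine next to a treefiles/COW export where they are not. */
static uint16_t transmission_flags(const struct export_cfg *e)
{
        uint16_t flags = NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH;

        if (e->readonly)
                flags |= NBD_FLAG_READ_ONLY;
        if (e->readonly || e->backend_shares_state)
                flags |= NBD_FLAG_MULTI_CONN_SAFE;
        return flags;
}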


-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [Nbd] [RESEND][PATCH 0/5] nbd improvements
  2016-09-15 15:23             ` Alex Bligh
@ 2016-09-15 21:10               ` Paolo Bonzini
  -1 siblings, 0 replies; 95+ messages in thread
From: Paolo Bonzini @ 2016-09-15 21:10 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Eric Blake, Wouter Verhelst, Josef Bacik, linux-block,
	Markus Pargmann, kernel-team, nbd-general, linux-kernel


On 15/09/2016 17:23, Alex Bligh wrote:
> Paolo,
> 
>> On 15 Sep 2016, at 15:07, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> I don't think QEMU forbids multiple clients to the single server, and
>> guarantees consistency as long as there is no overlap between writes and
>> reads.  These are the same guarantees you have for multiple commands on
>> a single connection.
>>
>> In other words, from the POV of QEMU there's no difference whether
>> multiple commands come from one or more connections.
> 
> This isn't really about ordering, it's about cache coherency
> and persisting things to disk.
> 
> What you say is correct as far as it goes in terms of ordering. However
> consider the scenario with read and writes on two channels as follows
> of the same block:
> 
>      Channel1     Channel2
> 
>      R                      Block read, and cached in user space in
>                             channel 1's cache
>                             Reply sent
> 
>                   W         New value written, channel 2's cache updated
>                             channel 1's cache not
> 
>      R                      Value returned from channel 1's cache.
> 
> 
> In the above scenario, there is a problem if the server(s) handling the
> two channels each use a read cache which is not coherent between the
> two channels. An example would be a read-through cache on a server that
> did fork() and shared no state between connections.

qemu-nbd does not fork(), so there is no coherency issue if W has replied.

However, if W hasn't replied, channel1 can get garbage.  Typically the
VM will be the one doing the writes, and everyone else must be ready to
handle whatever mess the VM throws at them.

Paolo

> Similarly, if there is a write on channel 1 that has completed, and
> the flush goes to channel 2, it may not (if state is not shared) guarantee
> that the write on channel 1 (which has completed) is persisted to non-volatile
> media. Obviously if the 'state' is OS block cache/buffers/whatever, it
> will, but if it's (e.g.) a user-space per process write-through cache,
> it won't.
> 
> I don't know whether qemu-nbd is likely to suffer from either of these.

It can't happen.  On the other hand, channel1 must be ready to handle
garbage, since that kind of overlapping access is illegal anyway.
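
To spell out the hazard Alex describes for servers that do keep
per-connection caches with no shared state (again, not qemu-nbd), here is a
standalone toy example; two read-through caches over one backing value are
enough to make the second read stale:

#include <stdbool.h>
#include <stdio.h>

static int backing = 1;                 /* the exported "block" */

struct conn_cache { bool valid; int value; };   /* one per connection */

static int cached_read(struct conn_cache *c)
{
        if (!c->valid) {                /* read-through on a miss */
                c->value = backing;
                c->valid = true;
        }
        return c->value;                /* a hit never touches the backing store */
}

static void cached_write(struct conn_cache *c, int v)
{
        backing = v;                    /* the data reaches the backing store... */
        c->value = v;                   /* ...but only this cache gets updated */
        c->valid = true;
}

int main(void)
{
        struct conn_cache ch1 = { false, 0 }, ch2 = { false, 0 };

        cached_read(&ch1);              /* channel 1: R, caches the old value */
        cached_write(&ch2, 2);          /* channel 2: W, writes the new value */
        printf("channel 1 re-reads %d, backing store holds %d\n",
               cached_read(&ch1), backing);    /* prints 1 vs 2: stale read */
        return 0;
}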





^ permalink raw reply	[flat|nested] 95+ messages in thread

end of thread, other threads:[~2016-09-15 21:11 UTC | newest]

Thread overview: 95+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-08 21:12 [RESEND][PATCH 0/5] nbd improvements Josef Bacik
2016-09-08 21:12 ` [PATCH 1/5] nbd: convert to blkmq Josef Bacik
2016-09-08 21:12 ` [PATCH 2/5] nbd: don't shutdown sock with irq's disabled Josef Bacik
2016-09-08 21:12 ` [PATCH 3/5] nbd: use flags instead of bool Josef Bacik
2016-09-09  1:20   ` Joe Perches
2016-09-09 13:55     ` Jens Axboe
2016-09-09 16:04       ` Joe Perches
2016-09-09 16:04         ` Joe Perches
2016-09-09 16:11         ` Jens Axboe
2016-09-09 16:15           ` Joe Perches
2016-09-09 16:20             ` Jens Axboe
2016-09-09 16:20               ` Jens Axboe
2016-09-08 21:12 ` [PATCH 4/5] nbd: allow block mq to deal with timeouts Josef Bacik
2016-09-08 21:12 ` [PATCH 5/5] nbd: add multi-connection support Josef Bacik
2016-09-10  7:43   ` Christoph Hellwig
2016-09-12 13:11     ` Josef Bacik
2016-09-09 20:02 ` [Nbd] [RESEND][PATCH 0/5] nbd improvements Wouter Verhelst
2016-09-09 20:36   ` Josef Bacik
2016-09-09 20:55     ` Wouter Verhelst
2016-09-09 23:00       ` Josef Bacik
2016-09-09 23:37         ` Jens Axboe
2016-09-15 10:49   ` Wouter Verhelst
2016-09-15 11:09     ` Alex Bligh
2016-09-15 11:09       ` Alex Bligh
2016-09-15 11:29       ` Wouter Verhelst
2016-09-15 11:29         ` Wouter Verhelst
2016-09-15 11:40         ` Christoph Hellwig
2016-09-15 11:40           ` Christoph Hellwig
2016-09-15 11:46           ` Alex Bligh
2016-09-15 11:46             ` Alex Bligh
2016-09-15 11:52             ` Christoph Hellwig
2016-09-15 11:52               ` Christoph Hellwig
2016-09-15 12:01               ` Wouter Verhelst
2016-09-15 12:01                 ` Wouter Verhelst
2016-09-15 12:20                 ` Christoph Hellwig
2016-09-15 12:20                   ` Christoph Hellwig
2016-09-15 12:26                   ` Wouter Verhelst
2016-09-15 12:26                     ` Wouter Verhelst
2016-09-15 12:27                     ` Christoph Hellwig
2016-09-15 12:27                       ` Christoph Hellwig
2016-09-15 12:04               ` Alex Bligh
2016-09-15 12:04                 ` Alex Bligh
2016-09-15 11:39       ` Christoph Hellwig
2016-09-15 11:39         ` Christoph Hellwig
2016-09-15 13:34       ` Eric Blake
2016-09-15 13:34         ` Eric Blake
2016-09-15 14:07         ` Paolo Bonzini
2016-09-15 14:07           ` Paolo Bonzini
2016-09-15 15:23           ` Alex Bligh
2016-09-15 15:23             ` Alex Bligh
2016-09-15 21:10             ` Paolo Bonzini
2016-09-15 21:10               ` Paolo Bonzini
2016-09-15 15:25         ` Alex Bligh
2016-09-15 15:25           ` Alex Bligh
2016-09-15 11:38     ` Christoph Hellwig
2016-09-15 11:43       ` Alex Bligh
2016-09-15 11:43         ` Alex Bligh
2016-09-15 11:46         ` Christoph Hellwig
2016-09-15 11:46           ` Christoph Hellwig
2016-09-15 11:56           ` Alex Bligh
2016-09-15 11:56             ` Alex Bligh
2016-09-15 11:55       ` Wouter Verhelst
2016-09-15 12:01         ` Christoph Hellwig
2016-09-15 12:11           ` Alex Bligh
2016-09-15 12:11             ` Alex Bligh
2016-09-15 12:18             ` Christoph Hellwig
2016-09-15 12:18               ` Christoph Hellwig
2016-09-15 12:28               ` Alex Bligh
2016-09-15 12:28                 ` Alex Bligh
2016-09-15 12:21           ` Wouter Verhelst
2016-09-15 12:23             ` Christoph Hellwig
2016-09-15 12:33               ` Alex Bligh
2016-09-15 12:33                 ` Alex Bligh
2016-09-15 12:36                 ` Christoph Hellwig
2016-09-15 12:36                   ` Christoph Hellwig
2016-09-15 12:39                   ` Alex Bligh
2016-09-15 12:39                     ` Alex Bligh
2016-09-15 12:41                     ` Christoph Hellwig
2016-09-15 12:41                       ` Christoph Hellwig
2016-09-15 12:44                       ` Alex Bligh
2016-09-15 12:44                         ` Alex Bligh
2016-09-15 13:17                         ` Wouter Verhelst
2016-09-15 13:17                           ` Wouter Verhelst
2016-09-15 13:57                           ` Josef Bacik
2016-09-15 13:57                             ` Josef Bacik
2016-09-15 15:17                             ` Alex Bligh
2016-09-15 15:17                               ` Alex Bligh
2016-09-15 16:08                           ` Alex Bligh
2016-09-15 16:08                             ` Alex Bligh
2016-09-15 16:27                             ` Wouter Verhelst
2016-09-15 16:27                               ` Wouter Verhelst
2016-09-15 16:42                               ` Alex Bligh
2016-09-15 16:42                                 ` Alex Bligh
2016-09-15 19:02                               ` Eric Blake
2016-09-15 19:02                                 ` Eric Blake
