* [PATCH][V3] nbd: add multi-connection support
@ 2016-09-28 20:01 Josef Bacik
  2016-09-29  9:52 ` [Nbd] " Wouter Verhelst
  2016-10-11  9:00 ` Sagi Grimberg
  0 siblings, 2 replies; 44+ messages in thread
From: Josef Bacik @ 2016-09-28 20:01 UTC (permalink / raw)
  To: axboe, linux-block, linux-kernel, kernel-team, nbd-general, w

NBD can become contended on its single connection.  We have to serialize all
writes and we can only process one read response at a time.  Fix this by
allowing userspace to provide multiple connections to a single nbd device.  This
coupled with block-mq drastically increases performance in multi-process cases.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
V2->V3:
-Fixed a problem with the tag used for the requests.
-Rebased onto the patch that enables async submit.

V1->V2:
-Dropped the index from nbd_cmd and just used the hctx->queue_num as HCH
 suggested
-Added the pid attribute back to the /sys/block/nbd*/ directory for the recv
 pid.
-Reworked the disconnect to simply send the command on all connections instead
 of sending a special command through the block layer.
-Fixed some of the disconnect handling to be less verbose when we specifically
 request a disconnect.
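
Not part of the patch, purely for illustration: a rough userspace sketch of
how several connections could be handed to one device, as described above.
connect_and_negotiate() is a stand-in for the TCP connect plus NBD handshake,
and the usual size/flags ioctls are omitted:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nbd.h>

int connect_and_negotiate(const char *host, int port); /* placeholder: TCP connect + NBD handshake, not shown */

int main(void)
{
	int nbd = open("/dev/nbd0", O_RDWR);
	int i;

	for (i = 0; i < 4; i++) {
		int sock = connect_and_negotiate("server", 10809);

		/* With multi-connection support each NBD_SET_SOCK call
		 * adds another connection instead of replacing the
		 * single socket. */
		if (ioctl(nbd, NBD_SET_SOCK, sock) < 0)
			return 1;
	}

	/* Runs the device until disconnect; the kernel spawns one
	 * receive worker per connection. */
	ioctl(nbd, NBD_DO_IT);
	close(nbd);
	return 0;
}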

 drivers/block/nbd.c | 358 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 217 insertions(+), 141 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index ccfcfc1..30f4f58 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -41,26 +41,36 @@
 
 #include <linux/nbd.h>
 
+struct nbd_sock {
+	struct socket *sock;
+	struct mutex tx_lock;
+};
+
 #define NBD_TIMEDOUT			0
 #define NBD_DISCONNECT_REQUESTED	1
+#define NBD_DISCONNECTED		2
+#define NBD_RUNNING			3
 
 struct nbd_device {
 	u32 flags;
 	unsigned long runtime_flags;
-	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
+	struct nbd_sock **socks;
 	int magic;
 
 	struct blk_mq_tag_set tag_set;
 
-	struct mutex tx_lock;
+	struct mutex config_lock;
 	struct gendisk *disk;
+	int num_connections;
+	atomic_t recv_threads;
+	wait_queue_head_t recv_wq;
 	int blksize;
 	loff_t bytesize;
 
 	/* protects initialization and shutdown of the socket */
 	spinlock_t sock_lock;
 	struct task_struct *task_recv;
-	struct task_struct *task_send;
+	struct task_struct *task_setup;
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 	struct dentry *dbg_dir;
@@ -69,7 +79,6 @@ struct nbd_device {
 
 struct nbd_cmd {
 	struct nbd_device *nbd;
-	struct list_head list;
 };
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -159,22 +168,20 @@ static void nbd_end_request(struct nbd_cmd *cmd)
  */
 static void sock_shutdown(struct nbd_device *nbd)
 {
-	struct socket *sock;
-
-	spin_lock(&nbd->sock_lock);
+	int i;
 
-	if (!nbd->sock) {
-		spin_unlock_irq(&nbd->sock_lock);
+	if (nbd->num_connections == 0)
+		return;
+	if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
 		return;
-	}
-
-	sock = nbd->sock;
-	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
-	nbd->sock = NULL;
-	spin_unlock(&nbd->sock_lock);
 
-	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sockfd_put(sock);
+	for (i = 0; i < nbd->num_connections; i++) {
+		struct nbd_sock *nsock = nbd->socks[i];
+		mutex_lock(&nsock->tx_lock);
+		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+		mutex_unlock(&nsock->tx_lock);
+	}
+	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 }
 
 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
@@ -182,35 +189,31 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct nbd_device *nbd = cmd->nbd;
-	struct socket *sock = NULL;
-
-	spin_lock(&nbd->sock_lock);
 
+	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
 	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
-
-	if (nbd->sock) {
-		sock = nbd->sock;
-		get_file(sock->file);
-	}
-
-	spin_unlock(&nbd->sock_lock);
-	if (sock) {
-		kernel_sock_shutdown(sock, SHUT_RDWR);
-		sockfd_put(sock);
-	}
-
 	req->errors++;
-	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
+
+	/*
+	 * If our disconnect packet times out then we're already holding the
+	 * config_lock and could deadlock here, so just set an error and return,
+	 * we'll handle shutting everything down later.
+	 */
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		return BLK_EH_HANDLED;
+	mutex_lock(&nbd->config_lock);
+	sock_shutdown(nbd);
+	mutex_unlock(&nbd->config_lock);
 	return BLK_EH_HANDLED;
 }
 
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
-		int msg_flags)
+static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf,
+		     int size, int msg_flags)
 {
-	struct socket *sock = nbd->sock;
+	struct socket *sock = nbd->socks[index]->sock;
 	int result;
 	struct msghdr msg;
 	struct kvec iov;
@@ -254,29 +257,28 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 	return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
-		int flags)
+static inline int sock_send_bvec(struct nbd_device *nbd, int index,
+				 struct bio_vec *bvec, int flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+	result = sock_xmit(nbd, index, 1, kaddr + bvec->bv_offset,
 			   bvec->bv_len, flags);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
+static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	int result, flags;
 	struct nbd_request request;
 	unsigned long size = blk_rq_bytes(req);
 	u32 type;
+	u32 tag = blk_mq_unique_tag(req);
 
-	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
-		type = NBD_CMD_DISC;
-	else if (req_op(req) == REQ_OP_DISCARD)
+	if (req_op(req) == REQ_OP_DISCARD)
 		type = NBD_CMD_TRIM;
 	else if (req_op(req) == REQ_OP_FLUSH)
 		type = NBD_CMD_FLUSH;
@@ -288,16 +290,16 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	memset(&request, 0, sizeof(request));
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(type);
-	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
+	if (type != NBD_CMD_FLUSH) {
 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 		request.len = htonl(size);
 	}
-	memcpy(request.handle, &req->tag, sizeof(req->tag));
+	memcpy(request.handle, &tag, sizeof(tag));
 
 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 		cmd, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
-	result = sock_xmit(nbd, 1, &request, sizeof(request),
+	result = sock_xmit(nbd, index, 1, &request, sizeof(request),
 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
 		dev_err(disk_to_dev(nbd->disk),
@@ -318,7 +320,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 				flags = MSG_MORE;
 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 				cmd, bvec.bv_len);
-			result = sock_send_bvec(nbd, &bvec, flags);
+			result = sock_send_bvec(nbd, index, &bvec, flags);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
@@ -330,31 +332,34 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	return 0;
 }
 
-static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, int index,
+				 struct bio_vec *bvec)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
-			MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, kaddr + bvec->bv_offset,
+			   bvec->bv_len, MSG_WAITALL);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
+static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 {
 	int result;
 	struct nbd_reply reply;
 	struct nbd_cmd *cmd;
 	struct request *req = NULL;
 	u16 hwq;
-	int tag;
+	u32 tag;
 
 	reply.magic = 0;
-	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, &reply, sizeof(reply), MSG_WAITALL);
 	if (result <= 0) {
-		dev_err(disk_to_dev(nbd->disk),
-			"Receive control failed (result %d)\n", result);
+		if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
+		    !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+			dev_err(disk_to_dev(nbd->disk),
+				"Receive control failed (result %d)\n", result);
 		return ERR_PTR(result);
 	}
 
@@ -364,7 +369,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 		return ERR_PTR(-EPROTO);
 	}
 
-	memcpy(&tag, reply.handle, sizeof(int));
+	memcpy(&tag, reply.handle, sizeof(u32));
 
 	hwq = blk_mq_unique_tag_to_hwq(tag);
 	if (hwq < nbd->tag_set.nr_hw_queues)
@@ -390,7 +395,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 		struct bio_vec bvec;
 
 		rq_for_each_segment(bvec, req, iter) {
-			result = sock_recv_bvec(nbd, &bvec);
+			result = sock_recv_bvec(nbd, index, &bvec);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
@@ -418,25 +423,24 @@ static struct device_attribute pid_attr = {
 	.show = pid_show,
 };
 
-static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
+struct recv_thread_args {
+	struct work_struct work;
+	struct nbd_device *nbd;
+	int index;
+};
+
+static void recv_work(struct work_struct *work)
 {
+	struct recv_thread_args *args = container_of(work,
+						     struct recv_thread_args,
+						     work);
+	struct nbd_device *nbd = args->nbd;
 	struct nbd_cmd *cmd;
-	int ret;
+	int ret = 0;
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
-
-	sk_set_memalloc(nbd->sock->sk);
-
-	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
-	if (ret) {
-		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-		return ret;
-	}
-
-	nbd_size_update(nbd, bdev);
-
 	while (1) {
-		cmd = nbd_read_stat(nbd);
+		cmd = nbd_read_stat(nbd, args->index);
 		if (IS_ERR(cmd)) {
 			ret = PTR_ERR(cmd);
 			break;
@@ -445,10 +449,14 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 		nbd_end_request(cmd);
 	}
 
-	nbd_size_clear(nbd, bdev);
-
-	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-	return ret;
+	/*
+	 * We got an error, shut everybody down if this wasn't the result of a
+	 * disconnect request.
+	 */
+	if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+		sock_shutdown(nbd);
+	atomic_dec(&nbd->recv_threads);
+	wake_up(&nbd->recv_wq);
 }
 
 static void nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -466,26 +474,35 @@ static void nbd_clear_que(struct nbd_device *nbd)
 {
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	/*
-	 * Because we have set nbd->sock to NULL under the tx_lock, all
-	 * modifications to the list must have completed by now.
-	 */
-	BUG_ON(nbd->sock);
-
 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }
 
 
-static void nbd_handle_cmd(struct nbd_cmd *cmd)
+static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	struct nbd_device *nbd = cmd->nbd;
+	struct nbd_sock *nsock;
 
-	if (req->cmd_type != REQ_TYPE_FS)
+	if (index >= nbd->num_connections) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on invalid socket\n");
 		goto error_out;
+	}
+
+	if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on closed socket\n");
+		goto error_out;
+	}
 
-	if (rq_data_dir(req) == WRITE &&
+	if (req->cmd_type != REQ_TYPE_FS &&
+	    req->cmd_type != REQ_TYPE_DRV_PRIV)
+		goto error_out;
+
+	if (req->cmd_type == REQ_TYPE_FS &&
+	    rq_data_dir(req) == WRITE &&
 	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Write on read-only\n");
@@ -494,23 +511,22 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd)
 
 	req->errors = 0;
 
-	mutex_lock(&nbd->tx_lock);
-	nbd->task_send = current;
-	if (unlikely(!nbd->sock)) {
-		mutex_unlock(&nbd->tx_lock);
+	nsock = nbd->socks[index];
+	mutex_lock(&nsock->tx_lock);
+	if (unlikely(!nsock->sock)) {
+		mutex_unlock(&nsock->tx_lock);
 		dev_err(disk_to_dev(nbd->disk),
 			"Attempted send on closed socket\n");
 		goto error_out;
 	}
 
-	if (nbd_send_cmd(nbd, cmd) != 0) {
+	if (nbd_send_cmd(nbd, cmd, index) != 0) {
 		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
 		nbd_end_request(cmd);
 	}
 
-	nbd->task_send = NULL;
-	mutex_unlock(&nbd->tx_lock);
+	mutex_unlock(&nsock->tx_lock);
 
 	return;
 
@@ -525,38 +541,57 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
 	blk_mq_start_request(bd->rq);
-	nbd_handle_cmd(cmd);
+	nbd_handle_cmd(cmd, hctx->queue_num);
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
+static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
 {
-	int ret = 0;
+	struct nbd_sock **socks;
+	struct nbd_sock *nsock;
 
-	spin_lock_irq(&nbd->sock_lock);
-
-	if (nbd->sock) {
-		ret = -EBUSY;
-		goto out;
+	if (!nbd->task_setup)
+		nbd->task_setup = current;
+	if (nbd->task_setup != current) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Device being setup by another task");
+		return -EINVAL;
 	}
 
-	nbd->sock = sock;
+	socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
+			 sizeof(struct nbd_sock *), GFP_KERNEL);
+	if (!socks)
+		return -ENOMEM;
+	nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
+	if (!nsock)
+		return -ENOMEM;
+
+	nbd->socks = socks;
 
-out:
-	spin_unlock_irq(&nbd->sock_lock);
+	mutex_init(&nsock->tx_lock);
+	nsock->sock = sock;
+	socks[nbd->num_connections++] = nsock;
 
-	return ret;
+	return 0;
 }
 
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
+	int i;
+
+	for (i = 0; i < nbd->num_connections; i++)
+		kfree(nbd->socks[i]);
+	kfree(nbd->socks);
+	nbd->socks = NULL;
 	nbd->runtime_flags = 0;
 	nbd->blksize = 1024;
 	nbd->bytesize = 0;
 	set_capacity(nbd->disk, 0);
 	nbd->flags = 0;
 	nbd->tag_set.timeout = 0;
+	nbd->num_connections = 0;
+	nbd->task_setup = NULL;
 	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 }
 
@@ -582,48 +617,67 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
 		blk_queue_write_cache(nbd->disk->queue, false, false);
 }
 
+static void send_disconnects(struct nbd_device *nbd)
+{
+	struct nbd_request request = {};
+	int i, ret;
+
+	request.magic = htonl(NBD_REQUEST_MAGIC);
+	request.type = htonl(NBD_CMD_DISC);
+
+	for (i = 0; i < nbd->num_connections; i++) {
+		ret = sock_xmit(nbd, i, 1, &request, sizeof(request), 0);
+		if (ret <= 0)
+			dev_err(disk_to_dev(nbd->disk),
+				"Send disconnect failed %d\n", ret);
+	}
+}
+
 static int nbd_dev_dbg_init(struct nbd_device *nbd);
 static void nbd_dev_dbg_close(struct nbd_device *nbd);
 
-/* Must be called with tx_lock held */
-
+/* Must be called with config_lock held */
 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		       unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 	case NBD_DISCONNECT: {
-		struct request *sreq;
-
 		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-		if (!nbd->sock)
+		if (!nbd->socks)
 			return -EINVAL;
 
-		sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
-		if (!sreq)
-			return -ENOMEM;
-
-		mutex_unlock(&nbd->tx_lock);
+		mutex_unlock(&nbd->config_lock);
 		fsync_bdev(bdev);
-		mutex_lock(&nbd->tx_lock);
-		sreq->cmd_type = REQ_TYPE_DRV_PRIV;
+		mutex_lock(&nbd->config_lock);
 
 		/* Check again after getting mutex back.  */
-		if (!nbd->sock) {
-			blk_mq_free_request(sreq);
+		if (!nbd->socks)
 			return -EINVAL;
-		}
-
-		set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
 
-		nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
-		blk_mq_free_request(sreq);
+		if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
+				      &nbd->runtime_flags))
+			send_disconnects(nbd);
 		return 0;
 	}
- 
+
 	case NBD_CLEAR_SOCK:
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
 		kill_bdev(bdev);
+		nbd_bdev_reset(bdev);
+		/*
+		 * We want to give the run thread a chance to wait for everybody
+		 * to clean up and then do it's own cleanup.
+		 */
+		if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) {
+			int i;
+
+			for (i = 0; i < nbd->num_connections; i++)
+				kfree(nbd->socks[i]);
+			kfree(nbd->socks);
+			nbd->socks = NULL;
+			nbd->num_connections = 0;
+		}
 		return 0;
 
 	case NBD_SET_SOCK: {
@@ -633,7 +687,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (!sock)
 			return err;
 
-		err = nbd_set_socket(nbd, sock);
+		err = nbd_add_socket(nbd, sock);
 		if (!err && max_part)
 			bdev->bd_invalidated = 1;
 
@@ -662,26 +716,53 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		return 0;
 
 	case NBD_DO_IT: {
-		int error;
+		struct recv_thread_args *args;
+		int num_connections = nbd->num_connections;
+		int error, i;
 
 		if (nbd->task_recv)
 			return -EBUSY;
-		if (!nbd->sock)
+		if (!nbd->socks)
 			return -EINVAL;
 
-		/* We have to claim the device under the lock */
+		set_bit(NBD_RUNNING, &nbd->runtime_flags);
+		blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
+		args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
+		if (!args)
+			goto out_err;
 		nbd->task_recv = current;
-		mutex_unlock(&nbd->tx_lock);
+		mutex_unlock(&nbd->config_lock);
 
 		nbd_parse_flags(nbd, bdev);
 
+		error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
+		if (error) {
+			dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+			goto out_recv;
+		}
+
+		nbd_size_update(nbd, bdev);
+
 		nbd_dev_dbg_init(nbd);
-		error = nbd_thread_recv(nbd, bdev);
+		for (i = 0; i < num_connections; i++) {
+			sk_set_memalloc(nbd->socks[i]->sock->sk);
+			atomic_inc(&nbd->recv_threads);
+			INIT_WORK(&args[i].work, recv_work);
+			args[i].nbd = nbd;
+			args[i].index = i;
+			queue_work(system_long_wq, &args[i].work);
+		}
+		wait_event_interruptible(nbd->recv_wq,
+					 atomic_read(&nbd->recv_threads) == 0);
+		for (i = 0; i < num_connections; i++)
+			flush_work(&args[i].work);
 		nbd_dev_dbg_close(nbd);
-
-		mutex_lock(&nbd->tx_lock);
+		nbd_size_clear(nbd, bdev);
+		device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+out_recv:
+		mutex_lock(&nbd->config_lock);
 		nbd->task_recv = NULL;
-
+out_err:
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
 		kill_bdev(bdev);
@@ -694,7 +775,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 			error = -ETIMEDOUT;
 
 		nbd_reset(nbd);
-
 		return error;
 	}
 
@@ -726,9 +806,9 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	mutex_lock(&nbd->tx_lock);
+	mutex_lock(&nbd->config_lock);
 	error = __nbd_ioctl(bdev, nbd, cmd, arg);
-	mutex_unlock(&nbd->tx_lock);
+	mutex_unlock(&nbd->config_lock);
 
 	return error;
 }
@@ -748,8 +828,6 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 
 	if (nbd->task_recv)
 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
-	if (nbd->task_send)
-		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));
 
 	return 0;
 }
@@ -873,9 +951,7 @@ static int nbd_init_request(void *data, struct request *rq,
 			    unsigned int numa_node)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
-
 	cmd->nbd = data;
-	INIT_LIST_HEAD(&cmd->list);
 	return 0;
 }
 
@@ -986,13 +1062,13 @@ static int __init nbd_init(void)
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].magic = NBD_MAGIC;
-		spin_lock_init(&nbd_dev[i].sock_lock);
-		mutex_init(&nbd_dev[i].tx_lock);
+		mutex_init(&nbd_dev[i].config_lock);
 		disk->major = NBD_MAJOR;
 		disk->first_minor = i << part_shift;
 		disk->fops = &nbd_fops;
 		disk->private_data = &nbd_dev[i];
 		sprintf(disk->disk_name, "nbd%d", i);
+		init_waitqueue_head(&nbd_dev[i].recv_wq);
 		nbd_reset(&nbd_dev[i]);
 		add_disk(disk);
 	}
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-09-28 20:01 [PATCH][V3] nbd: add multi-connection support Josef Bacik
@ 2016-09-29  9:52 ` Wouter Verhelst
  2016-09-29 14:03   ` Josef Bacik
  2016-10-11  9:00 ` Sagi Grimberg
  1 sibling, 1 reply; 44+ messages in thread
From: Wouter Verhelst @ 2016-09-29  9:52 UTC (permalink / raw)
  To: Josef Bacik; +Cc: axboe, linux-block, linux-kernel, kernel-team, nbd-general

Hi Josef,

On Wed, Sep 28, 2016 at 04:01:32PM -0400, Josef Bacik wrote:
> NBD can become contended on its single connection.  We have to serialize all
> writes and we can only process one read response at a time.  Fix this by
> allowing userspace to provide multiple connections to a single nbd device.  This
> coupled with block-mq drastically increases performance in multi-process cases.
> Thanks,

This reminds me: I've been pondering this for a while, and I think there
is no way we can guarantee the correct ordering of FLUSH replies in the
face of multiple connections, since a WRITE reply on one connection may
arrive before a FLUSH reply on another which it does not cover, even if
the server has no cache coherency issues otherwise.

Having said that, there can certainly be cases where that is not a
problem, and where performance considerations are more important than
reliability guarantees; so once this patch lands in the kernel (and the
necessary support patch lands in the userland utilities), I think I'll
just update the documentation to mention the problems that might ensue,
and be done with it.

I can see only a few ways in which to potentially solve this problem:
- Kernel-side nbd-client could send a FLUSH command over every channel,
  and only report successful completion once all replies have been
  received (see the sketch below). This might negate some of the
  performance benefits, however.
- Multiplexing commands over a single connection (perhaps an SCTP one,
  rather than TCP); this would require some effort though, as you said,
  and would probably complicate the protocol significantly.
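
To make that first option concrete, a rough and purely illustrative sketch,
modelled on send_disconnects() from the patch; the hard part, holding off
completion until every connection has acknowledged its flush, is left out:

static void send_flushes(struct nbd_device *nbd)
{
	struct nbd_request request = {};
	int i, ret;

	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(NBD_CMD_FLUSH);

	for (i = 0; i < nbd->num_connections; i++) {
		/* A real version would also need a unique handle per
		 * connection so the replies can be matched and counted
		 * before the original flush request is completed. */
		ret = sock_xmit(nbd, i, 1, &request, sizeof(request), 0);
		if (ret <= 0)
			dev_err(disk_to_dev(nbd->disk),
				"Send flush failed %d\n", ret);
	}
}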

Regards,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-09-29  9:52 ` [Nbd] " Wouter Verhelst
@ 2016-09-29 14:03   ` Josef Bacik
  2016-09-29 16:41     ` Wouter Verhelst
  0 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2016-09-29 14:03 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: axboe, linux-block, linux-kernel, kernel-team, nbd-general

On 09/29/2016 05:52 AM, Wouter Verhelst wrote:
> Hi Josef,
>
> On Wed, Sep 28, 2016 at 04:01:32PM -0400, Josef Bacik wrote:
>> NBD can become contended on its single connection.  We have to serialize all
>> writes and we can only process one read response at a time.  Fix this by
>> allowing userspace to provide multiple connections to a single nbd device.  This
>> coupled with block-mq drastically increases performance in multi-process cases.
>> Thanks,
>
> This reminds me: I've been pondering this for a while, and I think there
> is no way we can guarantee the correct ordering of FLUSH replies in the
> face of multiple connections, since a WRITE reply on one connection may
> arrive before a FLUSH reply on another which it does not cover, even if
> the server has no cache coherency issues otherwise.
>
> Having said that, there can certainly be cases where that is not a
> problem, and where performance considerations are more important than
> reliability guarantees; so once this patch lands in the kernel (and the
> necessary support patch lands in the userland utilities), I think I'll
> just update the documentation to mention the problems that might ensue,
> and be done with it.
>
> I can see only a few ways in which to potentially solve this problem:
> - Kernel-side nbd-client could send a FLUSH command over every channel,
>   and only report successful completion once all replies have been
>   received. This might negate some of the performance benefits, however.
> - Multiplexing commands over a single connection (perhaps an SCTP one,
>   rather than TCP); this would require some effort though, as you said,
>   and would probably complicate the protocol significantly.
>

So think of it like normal disks with multiple channels.  We don't send flushes 
down all the hwq's to make sure they are clear, we leave that decision up to the 
application (usually a FS of course).  So what we're doing here is no worse than 
what every real disk on the planet does, our hw queues just have a lot 
longer transfer times and are more error prone ;).  I definitely think 
documenting the behavior is important so that people don't expect magic to 
happen, and perhaps we could add a flag later that says send all the flushes 
down all the connections for the paranoid, it should be relatively 
straightforward to do.  Thanks,

Josef


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-09-29 14:03   ` Josef Bacik
@ 2016-09-29 16:41     ` Wouter Verhelst
  2016-09-29 16:59       ` Josef Bacik
  0 siblings, 1 reply; 44+ messages in thread
From: Wouter Verhelst @ 2016-09-29 16:41 UTC (permalink / raw)
  To: Josef Bacik; +Cc: axboe, linux-block, kernel-team, nbd-general, linux-kernel

On Thu, Sep 29, 2016 at 10:03:50AM -0400, Josef Bacik wrote:
> So think of it like normal disks with multiple channels.  We don't send flushes 
> down all the hwq's to make sure they are clear, we leave that decision up to the 
> application (usually a FS of course).

Well, when I asked earlier, Christoph said[1] that blk-mq assumes that
when a FLUSH is sent over one channel, and the reply comes in, that all
commands which have been received, regardless of which channel they were
received over, have reached disk.

[1] Message-ID: <20160915122304.GA15501@infradead.org>

It is impossible for nbd to make such a guarantee, due to head-of-line
blocking on TCP.

[...]
> perhaps we could add a flag later that says send all the flushes down
> all the connections for the paranoid, it should be relatively
> straightforward to do.  Thanks,

That's not an unreasonable approach, I guess.

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-09-29 16:41     ` Wouter Verhelst
@ 2016-09-29 16:59       ` Josef Bacik
  2016-10-02 16:17           ` Alex Bligh
  0 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2016-09-29 16:59 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: axboe, linux-block, kernel-team, nbd-general, linux-kernel

On 09/29/2016 12:41 PM, Wouter Verhelst wrote:
> On Thu, Sep 29, 2016 at 10:03:50AM -0400, Josef Bacik wrote:
>> So think of it like normal disks with multiple channels.  We don't send flushes
>> down all the hwq's to make sure they are clear, we leave that decision up to the
>> application (usually a FS of course).
>
> Well, when I asked earlier, Christoph said[1] that blk-mq assumes that
> when a FLUSH is sent over one channel, and the reply comes in, that all
> commands which have been received, regardless of which channel they were
> received over, have reached disk.
>
> [1] Message-ID: <20160915122304.GA15501@infradead.org>
>
> It is impossible for nbd to make such a guarantee, due to head-of-line
> blocking on TCP.
>

Huh I missed that.  Yeah that's not possible for us for sure, I think my option 
idea is the less awful way forward if we want to address that limitation.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
@ 2016-10-02 16:17           ` Alex Bligh
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Bligh @ 2016-10-02 16:17 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Alex Bligh, Wouter Verhelst, axboe, linux-block, kernel-team,
	nbd-general, linux-kernel


> On 29 Sep 2016, at 17:59, Josef Bacik <jbacik@fb.com> wrote:
> 
> On 09/29/2016 12:41 PM, Wouter Verhelst wrote:
>> On Thu, Sep 29, 2016 at 10:03:50AM -0400, Josef Bacik wrote:
>>> So think of it like normal disks with multiple channels.  We don't send flushes
>>> down all the hwq's to make sure they are clear, we leave that decision up to the
>>> application (usually a FS of course).
>> 
>> Well, when I asked earlier, Christoph said[1] that blk-mq assumes that
>> when a FLUSH is sent over one channel, and the reply comes in, that all
>> commands which have been received, regardless of which channel they were
>> received over, have reached disk.
>> 
>> [1] Message-ID: <20160915122304.GA15501@infradead.org>
>> 
>> It is impossible for nbd to make such a guarantee, due to head-of-line
>> blocking on TCP.
>> 
> 
> Huh I missed that.  Yeah that's not possible for us for sure, I think my option 
> idea is the less awful way forward if we want to address that limitation.  Thanks,

I think if the server supports flush (which you can tell), sending flush on
all channels is the only safe thing to do, without substantial protocol
changes (which I'm not sure how one would do given flush is in a sense
a synchronisation point). I think it's thus imperative this gets fixed
before the change gets merged.

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
@ 2016-10-03  1:47             ` Josef Bacik
  0 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2016-10-03  1:47 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Wouter Verhelst, Jens Axboe, linux-block, Kernel Team,
	nbd-general, linux-kernel


> On Oct 2, 2016, at 12:17 PM, Alex Bligh <alex@alex.org.uk> wrote:
> 
> 
>> On 29 Sep 2016, at 17:59, Josef Bacik <jbacik@fb.com> wrote:
>> 
>> On 09/29/2016 12:41 PM, Wouter Verhelst wrote:
>>>> On Thu, Sep 29, 2016 at 10:03:50AM -0400, Josef Bacik wrote:
>>>> So think of it like normal disks with multiple channels.  We don't send flushes
>>>> down all the hwq's to make sure they are clear, we leave that decision up to the
>>>> application (usually a FS of course).
>>> 
>>> Well, when I asked earlier, Christoph said[1] that blk-mq assumes that
>>> when a FLUSH is sent over one channel, and the reply comes in, that all
>>> commands which have been received, regardless of which channel they were
>>> received over, have reached disk.
>>> 
>>> [1] Message-ID: <20160915122304.GA15501@infradead.org>
>>> 
>>> It is impossible for nbd to make such a guarantee, due to head-of-line
>>> blocking on TCP.
>>> 
>> 
>> Huh I missed that.  Yeah that's not possible for us for sure, I think my option 
>> idea is the less awful way forward if we want to address that limitation.  Thanks,
> 
> I think if the server supports flush (which you can tell), sending flush on
> all channels is the only safe thing to do, without substantial protocol
> changes (which I'm not sure how one would do given flush is in a sense
> a synchronisation point). I think it's thus imperative this gets fixed
> before the change gets merged.

It's not "broken", it's working as designed, and any fs on top of this patch will be perfectly safe because they all wait for their io to complete before issuing the FLUSH.  If somebody wants to address the paranoid case later then all the power to them, but this works for my use case and isn't inherently broken.  If it doesn't work for yours then don't use the feature, it's that simple.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-03  1:47             ` Josef Bacik
@ 2016-10-03  7:20               ` Christoph Hellwig
  -1 siblings, 0 replies; 44+ messages in thread
From: Christoph Hellwig @ 2016-10-03  7:20 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Alex Bligh, Wouter Verhelst, Jens Axboe, linux-block,
	Kernel Team, nbd-general, linux-kernel

On Mon, Oct 03, 2016 at 01:47:06AM +0000, Josef Bacik wrote:
> It's not "broken", it's working as designed, and any fs on top of this patch will be perfectly safe because they all wait for their io to complete before issuing the FLUSH.  If somebody wants to address the paranoid case later then all the power to them, but this works for my use case and isn't inherently broken.  If it doesn't work for yours then don't use the feature, it's that simple.  Thanks,

Let's take one step back here.  I agree with Josef that sending
one single flush is perfectly fine for all usual cases.  The issue
that was brought up last time we had this discussion was that some
(I think mostly theoretical) backends could not be coherent and
this would be an issue.

So maybe the right way is to simply not support the current odd flush
definition in the kernel then and require a new properly defined flush
version instead.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-02 16:17           ` Alex Bligh
@ 2016-10-03  7:49             ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-03  7:49 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Josef Bacik, axboe, linux-block, kernel-team, nbd-general, linux-kernel

On Sun, Oct 02, 2016 at 05:17:14PM +0100, Alex Bligh wrote:
> On 29 Sep 2016, at 17:59, Josef Bacik <jbacik@fb.com> wrote:
> > Huh I missed that.  Yeah that's not possible for us for sure, I think my option 
> > idea is the less awful way forward if we want to address that limitation.  Thanks,
> 
> I think if the server supports flush (which you can tell), sending flush on
> all channels is the only safe thing to do, without substantial protocol
> changes (which I'm not sure how one would do given flush is in a sense
> a synchronisation point). I think it's thus imperative this gets fixed
> before the change gets merged.

Whoa there, Alex.

I don't think this should be a blocker. There is a theoretical problem
yes, but I believe it to be limited to the case where the client and the
server are not in the same broadcast domain, which is not the common
case (most NBD connections run either over the localhost iface, or to a
machine nearby). In the case where the client and server are on the same
LAN, random packet drop is highly unlikely, so TCP communication will
not be delayed and so the replies will, with high certainty, arrive in
the same order that they were sent.

Obviously the documentation for the "spawn multiple connections" option
in nbd-client needs to clearly state that it will decrease reliability
in this edge case, but I don't think that blocking this feature until a
solution for this problem is implemented is the right way forward. There
are valid use cases where using multiple connections is preferable, even
with the current state of affairs, and they do not all involve "switch
off flush".

Regards,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-03  7:20               ` Christoph Hellwig
@ 2016-10-03  7:51                 ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-03  7:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Josef Bacik, Alex Bligh, Jens Axboe, linux-block, Kernel Team,
	nbd-general, linux-kernel

On Mon, Oct 03, 2016 at 12:20:49AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 03, 2016 at 01:47:06AM +0000, Josef Bacik wrote:
> > It's not "broken", it's working as designed, and any fs on top of this
> > patch will be perfectly safe because they all wait for their io to complete
> > before issuing the FLUSH.  If somebody wants to address the paranoid case
> > later then all the power to them, but this works for my use case and isn't
> > inherently broken.  If it doesn't work for yours then don't use the
> > feature, it's that simple.  Thanks,
> 
> Let's take one step back here.  I agree with Josef that sending
> one single flush is perfectly fine for all usual cases.  The issue
> that was brought up last time we had this discussion was that some
> (I think mostly theoretical) backends could not be coherent and
> this would be an issue.

Actually, I was pointing out the TCP head-of-line issue, where a delay
on the socket that contains the flush reply would result in the arrival
in the kernel block layer of a write reply before the said flush reply,
resulting in a write being considered part of the flush when in fact it
was not.

This is an edge case, and one highly unlikely to result in problems in
the common case (as per my other mail), but it is something to consider.

> So maybe the right way is to simply not support the current odd flush
> definition in the kernel then and require a new properly defined flush
> version instead.

Can you clarify what you mean by that? Why is it an "odd flush
definition", and how would you "properly" define it?

Thanks,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-03  7:51                 ` Wouter Verhelst
@ 2016-10-03  7:57                   ` Christoph Hellwig
  -1 siblings, 0 replies; 44+ messages in thread
From: Christoph Hellwig @ 2016-10-03  7:57 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Christoph Hellwig, Josef Bacik, Alex Bligh, Jens Axboe,
	linux-block, Kernel Team, nbd-general, linux-kernel

On Mon, Oct 03, 2016 at 09:51:49AM +0200, Wouter Verhelst wrote:
> Actually, I was pointing out the TCP head-of-line issue, where a delay
> on the socket that contains the flush reply would result in the arrival
> in the kernel block layer of a write reply before the said flush reply,
> resulting in a write being considered part of the flush when in fact it
> was not.

The kernel (or any other user of SCSI/ATA/NVMe-like cache flushes)
will explicitly wait for all I/O that needs to be in the cache,
so this is not a problem.

> Can you clarify what you mean by that? Why is it an "odd flush
> definition", and how would you "properly" define it?

E.g. take the definition from NVMe which also supports multiple queues:

"The Flush command shall commit data and metadata associated with the
specified namespace(s) to non-volatile media. The flush applies to all
commands completed prior to the submission of the Flush command.
The controller may also flush additional data and/or metadata from any
namespace."

The focus is completed - we need to get a reply to the host first
before we can send the flush command, so anything that we require
to be flushed needs to explicitly be completed first.
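
To make that ordering concrete (illustrative timeline, not from any spec):

    t0  WRITE A submitted on connection 1
    t1  WRITE A reply seen by the block layer   -> A is "completed"
    t2  FLUSH submitted (on any connection)     -> must cover A
    t3  WRITE B submitted on connection 2       -> need not be covered
    t4  FLUSH reply received

Only writes whose completions have been seen before t2 are expected to be
covered; anything still in flight, like B, is not part of the guarantee.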


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
@ 2016-10-03 14:32                       ` Josef Bacik
  0 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2016-10-03 14:32 UTC (permalink / raw)
  To: Alex Bligh, Christoph Hellwig
  Cc: Wouter Verhelst, Jens Axboe, linux-block, Kernel Team,
	nbd-general, linux-kernel

On 10/03/2016 07:34 AM, Alex Bligh wrote:
>
>> On 3 Oct 2016, at 08:57, Christoph Hellwig <hch@infradead.org> wrote:
>>
>>> Can you clarify what you mean by that? Why is it an "odd flush
>>> definition", and how would you "properly" define it?
>>
>> E.g. take the defintion from NVMe which also supports multiple queues:
>>
>> "The Flush command shall commit data and metadata associated with the
>> specified namespace(s) to non-volatile media. The flush applies to all
>> commands completed prior to the submission of the Flush command.
>> The controller may also flush additional data and/or metadata from any
>> namespace."
>>
>> The focus is completed - we need to get a reply to the host first
>> before we can send the flush command, so anything that we require
>> to be flushed needs to explicitly be completed first.
>
> I think there are two separate issues here:
>
> a) What's described as the "HOL blocking issue".
>
> This comes down to what Wouter said here:
>
>> Well, when I asked earlier, Christoph said[1] that blk-mq assumes that
>> when a FLUSH is sent over one channel, and the reply comes in, that all
>> commands which have been received, regardless of which channel they were
>> received over, have reched disk.
>>
>> [1] Message-ID: <20160915122304.GA15501@infradead.org>
>>
>> It is impossible for nbd to make such a guarantee, due to head-of-line
>> blocking on TCP.
>
> this is perfectly accurate as far as it goes, but this isn't the current
> NBD definition of 'flush'.
>
> That is (from the docs):
>
>> All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM) that the server completes (i.e. replies to) prior to processing to a NBD_CMD_FLUSH MUST be written to non-volatile storage prior to replying to thatNBD_CMD_FLUSH. This paragraph only applies if NBD_FLAG_SEND_FLUSH is set within the transmission flags, as otherwise NBD_CMD_FLUSH will never be sent by the client to the server.
>
> I don't think HOL blocking is an issue here by that definition, because all FLUSH requires is that commands that are actually completed are flushed to disk. If there is head of line blocking which delays the arrival of a write issued before a flush, then the sender cannot be relying on whether that write is actually completed or not (or it would have waited for the result). The flush requires only that those commands COMPLETED are flushed to disk, not that those commands RECEIVED have been flushed to disk (and a fortiori not that those commands SENT FIRST) have been flushed to disk. From the point of view of the client, the flush can therefore only guarantee that the data associated with those commands for which it's actually received a reply prior to issuing the flush will be flushed, because the replies can be disordered too.
>
> I don't think there is actually a problem here - Wouter if I'm wrong about this, I'd like to understand your argument better.
>
>
>
> b) What I'm describing - which is the lack of synchronisation between channels.
>
> Suppose you have a simple forking NBD server which uses (e.g.) a Ceph backend. Each process (i.e. each NBD channel) will have a separate connection to something with its own cache and buffering. Issuing a flush in Ceph requires waiting until a quorum of backends (OSDs) has been written to, and with a number of backends substantially greater than the quorum, it is not unlikely that a flush on one channel will not wait for writes on what Ceph considers a completely independent channel to have fully written (assuming the write completes before the flush is done).
>
> The same would happen pretty trivially with a forking server that uses a process-space write-back cache.
>
> This is because the spec when the spec says: "All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM) that the server completes (i.e. replies to) prior to processing to a NBD_CMD_FLUSH MUST be written to non-volatile storage prior to replying to that NBD_CMD_FLUSH." what it currently means is actually "All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM) ***ASSOCIATED WITH THAT CLIENT*** that the server completes (i.e. replies to) prior to processing to a NBD_CMD_FLUSH MUST be written to non-volatile storage prior to replying to that NBD_CMD_FLUSH".
>
> So what we would need the spec to mean is "All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM) ***ASSOCIATED WITH ANY CHANNEL OF THAT CLIENT*** that the server completes (i.e. replies to) prior to processing to a NBD_CMD_FLUSH MUST be written to non-volatile storage prior to replying to that NBD_CMD_FLUSH". And as we have no way to associate different channels of the same client, for servers that can't rely on the OS to synchronise flushing across different clients relating to the same file, in practice that means "All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM) ***ASSOCIATED WITH ANY CLIENT AT ALL*** that the server completes (i.e. replies to) prior to processing to a NBD_CMD_FLUSH MUST be written to non-volatile storage prior to replying to that NBD_CMD_FLUSH" - i.e. a flush on any channel of any client must flush every channel of every client, because we have no easy way to tell which clients are in fact two channels. I have concerns over the scalability of this.
>
> Now, in the reference server, NBD_CMD_FLUSH is implemented through an fdatasync(). Each client (and therefore each channel) runs in a different process.
>
> Earlier in this thread, someone suggested that if this happens:
>
>       Process A                Process B
>       =========                =========
>
>       fd1=open("file123")
>                                fd2=open("file123")
>
>       write(fd1, ...)
>                                fdatasync("fd2")
>
> then the fdatasync() is guaranteed to sync the write that Process A has written. This may or may not be the case under Linux (wiser minds than me will know). Is it guaranteed to be the case with (e.g.) the file on NFS? On all POSIX platforms?
>
> Looking at http://pubs.opengroup.org/onlinepubs/009695399/functions/fdatasync.html I'd say it was a little ambiguous as to whether it will ALWAYS flush all data associated with the file even if it is being written by a different process (and a different FD).
>
> If fdatasync() is always guaranteed to flush data associated with writes by a different process (and separately opened fd), then as it happens there won't be a problem on the reference server, just on servers that don't happen to use fdatasync() or similar to implement flushes, and which don't maintain their own caches. If fdatasync() is not so guaranteed, we have a problem with the reference server too, at least on some platforms and fling systems.
>
> What I'm therefore asking for is either:
> a) that the server can say 'if you are multichannel, you will need to send flush on each channel' (best); OR
> b) that the server can say 'don't go multichannel'
>
> as part of the negotiation stage. Of course as this is dependent on the backend, this is going to be something that is per-target (i.e. needs to come as a transmission flag or similar).
>

Ok I understand your objections now.  You aren't arguing that we are unsafe by 
default, only that we are unsafe with servers that do something special beyond 
simply writing to a single disk or file.  I agree this is problematic, but you 
simply don't use this feature if your server can't deal with it well.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
@ 2016-10-03 14:46                         ` Alex Bligh
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Bligh @ 2016-10-03 14:46 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Alex Bligh, Christoph Hellwig, Wouter Verhelst, Jens Axboe,
	linux-block, Kernel Team, nbd-general, linux-kernel

Josef,

> On 3 Oct 2016, at 15:32, Josef Bacik <jbacik@fb.com> wrote:
> 
> Ok I understand your objections now.  You aren't arguing that we are unsafe by default, only that we are unsafe with servers that do something special beyond simply writing to a single disk or file.  I agree this is problematic, but you simply don't use this feature if your server can't deal with it well.

Not quite. I am arguing that:

1. Parallel channels as currently implemented are inherently unsafe on some servers - I have given an example of such servers

2. Parallel channels as currently implemented may be unsafe on the reference server on some or all platforms (depending on the behaviour of fdatasync(), which might vary between platforms)

3. Either the parallel channels stuff should issue flush requests on all channels, or the protocol should protect the unwitting user by negotiating an option to do so (or refusing multi-channel connects). It is not reasonable for an nbd client to have to know the intimate details of the server and its implementation of synchronisation primitives - saying 'the user should disable multiple channels' is not good enough, as this is what we have a protocol for.

"unsafe" here means 'do not conform to the kernel's requirements for flush semantics, whereas they did without multiple channels'

So from my point of view either we need to have an additional connection flag ("don't use multiple channels unless you are going to issue a flush on all channels") OR flush on all channels should be the default.
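
For concreteness, here is a minimal client-side sketch (mine only, not part of the patch) of what 'flush on all channels' would look like, using the request/reply structures exported through <linux/nbd.h>, and assuming no other requests are in flight on any of the sockets:

/* Illustration only: issue NBD_CMD_FLUSH on every connection and wait for
 * every reply before treating the flush as complete. */
#include <endian.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/nbd.h>

/* Send an NBD_CMD_FLUSH request on one connection. */
static int send_flush(int sock, uint64_t handle)
{
	struct nbd_request req;

	memset(&req, 0, sizeof(req));
	req.magic = htobe32(NBD_REQUEST_MAGIC);
	req.type  = htobe32(NBD_CMD_FLUSH);
	memcpy(req.handle, &handle, sizeof(req.handle));
	/* 'from' and 'len' stay zero for a flush */
	return send(sock, &req, sizeof(req), 0) == (ssize_t)sizeof(req) ? 0 : -1;
}

/* Wait for the flush reply; assumes nothing else is outstanding on 'sock'. */
static int wait_flush_reply(int sock)
{
	struct nbd_reply rsp;

	if (recv(sock, &rsp, sizeof(rsp), MSG_WAITALL) != (ssize_t)sizeof(rsp))
		return -1;
	if (be32toh(rsp.magic) != NBD_REPLY_MAGIC)
		return -1;
	return be32toh(rsp.error) ? -1 : 0;
}

/* A flush only means anything once *every* channel has acknowledged it,
 * since each channel may sit in front of its own cache (a Ceph connection,
 * a per-process write-back cache, a per-socket COW file, ...). */
int flush_all_channels(const int *socks, size_t nsocks)
{
	size_t i;
	int ret = 0;

	for (i = 0; i < nsocks; i++)
		if (send_flush(socks[i], 0x464c5553ULL + i))	/* arbitrary handles */
			return -1;
	for (i = 0; i < nsocks; i++)
		if (wait_flush_reply(socks[i]))
			ret = -1;
	return ret;
}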

Whatever happens, SOME more work needs to be done on your patch, because it currently doesn't issue flush on all channels, and it currently does not have any way to prevent use of multiple channels.


I'd really like someone with linux / POSIX knowledge to confirm the linux and POSIX position on fdatasync of an fd opened by one process on writes made to the same file by another process. If on all POSIX platforms and all filing systems this is guaranteed to flush the second process's writes, then I think we could argue "OK there may be some weird servers which might not support multiple channels and they just need a way of signalling that". If on the other hand there is no such cross platform guarantee, I think this means in essence even with the reference server, this patch is unsafe, and it needs adapting to send flushes on all channels - yes it might theoretically be possible to introduce IPC to the current server, but you'd still need some way of tying together channels from a single client.
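
To make the question concrete, here is the scenario as a small self-contained program ("file123" is just the placeholder name from the earlier diagram; error handling trimmed). On Linux, fsync()/fdatasync() are generally understood to operate on the file rather than on the descriptor that wrote the data, so B's fdatasync() should cover A's completed write; whether every POSIX platform and filing system gives the same guarantee is exactly the question:

#include <fcntl.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2], fd1, status = 1;
	char c;

	if (pipe(pipefd) < 0 ||
	    (fd1 = open("file123", O_RDWR | O_CREAT, 0644)) < 0) {
		perror("setup");
		return 1;
	}

	switch (fork()) {
	case -1:
		perror("fork");
		return 1;
	case 0: {				/* Process B */
		int fd2 = open("file123", O_RDWR);	/* an independent fd */

		close(fd1);
		close(pipefd[1]);
		if (fd2 < 0 || read(pipefd[0], &c, 1) != 1)
			_exit(1);
		/* The open question: does this persist A's write via fd1? */
		_exit(fdatasync(fd2) == 0 ? 0 : 1);
	}
	default:				/* Process A */
		close(pipefd[0]);
		if (write(fd1, "data", 4) != 4)
			return 1;
		if (write(pipefd[1], "x", 1) != 1)	/* signal: write completed */
			return 1;
		wait(&status);
		return WIFEXITED(status) ? WEXITSTATUS(status) : 1;
	}
}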

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-03 11:34                     ` Alex Bligh
@ 2016-10-03 21:07                       ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-03 21:07 UTC (permalink / raw)
  To: Alex Bligh
  Cc: Christoph Hellwig, nbd-general, Jens Axboe, Josef Bacik,
	linux-kernel, linux-block, Kernel Team

Alex,
Christoph,

On Mon, Oct 03, 2016 at 12:34:33PM +0100, Alex Bligh wrote:
> On 3 Oct 2016, at 08:57, Christoph Hellwig <hch@infradead.org> wrote:
> >> Can you clarify what you mean by that? Why is it an "odd flush
> >> definition", and how would you "properly" define it?
> > 
> > E.g. take the defintion from NVMe which also supports multiple queues:
> > 
> > "The Flush command shall commit data and metadata associated with the
> > specified namespace(s) to non-volatile media. The flush applies to all
> > commands completed prior to the submission of the Flush command.

Oh, prior to submission? Okay, that means I must have misunderstood what
you meant before; in that case there shouldn't be a problem.

> > The controller may also flush additional data and/or metadata from any
> > namespace."
> > 
> > The focus is completed - we need to get a reply to the host first
> > before we can send the flush command, so anything that we require
> > to be flushed needs to explicitly be completed first.
> 
> I think there are two separate issues here:
> 
> a) What's described as the "HOL blocking issue". 
> 
> This comes down to what Wouter said here:
> 
> > Well, when I asked earlier, Christoph said[1] that blk-mq assumes that
> > when a FLUSH is sent over one channel, and the reply comes in, that all
> > commands which have been received, regardless of which channel they were
> > received over, have reched disk.
> > 
> > [1] Message-ID: <20160915122304.GA15501@infradead.org>
> > 
> > It is impossible for nbd to make such a guarantee, due to head-of-line
> > blocking on TCP.
> 
> this is perfectly accurate as far as it goes, but this isn't the current
> NBD definition of 'flush'.

I didn't read it that way.

> That is (from the docs):
> 
> > All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM)
> > that the server completes (i.e. replies to) prior to processing to a
> > NBD_CMD_FLUSH MUST be written to non-volatile storage prior to
> > replying to thatNBD_CMD_FLUSH.

This is somewhat ambiguous, in that (IMO) it doesn't clearly state the
point where the cutoff of "may not be on disk yet" is. What is
"processing"? We don't define that, and therefore it could be any point
between "receipt of the request message" and "sending the reply
message". I had interpreted it closer to the latter than was apparently
intended, but that isn't very useful; I see now that it should be closer
to the former; a more useful definition is probably something along the
following lines:

    All write commands (that includes NBD_CMD_WRITE and NBD_CMD_TRIM)
    for which a reply was received on the client side prior to the
    transmission of the NBD_CMD_FLUSH message MUST be written to
    non-volatile storage prior to replying to that NBD_CMD_FLUSH. A
    server MAY process this command in ways that result in committing
    more data to non-volatile storage than is strictly required.

[...]
> I don't think there is actually a problem here - Wouter if I'm wrong
> about this, I'd like to understand your argument better.

No, I now see that there isn't, and I misunderstood things. However, I
do think we should update the spec to clarify this.

> b) What I'm describing - which is the lack of synchronisation between
> channels.
[... long explanation snipped...]

Yes, and I acknowledge that. However, I think that should not be a
blocker. It's fine to mark this feature as experimental; it will not
ever be required to use multiple connections to connect to a server.

When this feature lands in nbd-client, I plan to ensure that the man
page and -help output says something along the following lines:

 use N connections to connect to the NBD server, improving performance
 at the cost of a possible loss of reliability.

 The interactions between multiple connections and the NBD_CMD_FLUSH
 command, especially when the actual storage and the NBD server are not
 physically on the same machine, are not currently well defined and not
 completely understood, and therefore the use of multiple connections to
 the same server could theoretically lead to data corruption and/or
 loss. Use with caution.

This probably needs some better wording, but you get the idea.

(also, this will interact really really badly with the reference
implementation's idea of copy-on-write, I suppose, since that implements
COW on a per-socket basis with a per-IP diff file...)

Anyway, the point is that this is a feature which may still cause
problems in a number of edge cases and which should therefore not be the
default yet, but which can be useful in a number of common situations
for which NBD is used today.

[...]
> Now, in the reference server, NBD_CMD_FLUSH is implemented through an
> fdatasync().

Actually, no, the reference server uses fsync() for reasons that I've
forgotten (side note: you wrote it that way ;-)

[...]

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-03 21:07                       ` Wouter Verhelst
@ 2016-10-04  9:35                         ` Alex Bligh
  -1 siblings, 0 replies; 44+ messages in thread
From: Alex Bligh @ 2016-10-04  9:35 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, Christoph Hellwig, nbd-general, Jens Axboe,
	Josef Bacik, linux-kernel, linux-block, Kernel Team

Wouter,

>>> It is impossible for nbd to make such a guarantee, due to head-of-line
>>> blocking on TCP.
>> 
>> this is perfectly accurate as far as it goes, but this isn't the current
>> NBD definition of 'flush'.
> 
> I didn't read it that way.
> 
>> That is (from the docs):
>> 
>>> All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM)
>>> that the server completes (i.e. replies to) prior to processing to a
>>> NBD_CMD_FLUSH MUST be written to non-volatile storage prior to
>>> replying to thatNBD_CMD_FLUSH.
> 
> This is somewhat ambiguous, in that (IMO) it doesn't clearly state the
> point where the cutoff of "may not be on disk yet" is. What is
> "processing"?

OK. I now get the problem. There are actually two types of HOL blocking,
server to client and client to server.

Previously, we allowed the server to issue replies out of order with requests.
However, the protocol did guarantee that the server saw requests in the
order presented by clients. With the proposed multi-connection support,
this changes. Whilst the client needed to be prepared for things to be
disordered by the server, the server did not previously need to be
prepared for things being disordered by the client. And (more subtly)
the client could assume that the server got its own requests in the
order it sent them, which is important for flush the way written at the
moment.


Here's an actual illustration of the problem:

Currently we have:

     Client              Server
     ======              ======

     TX: WRITE
     RX: FLUSH
                         RX: WRITE
                         RX: FLUSH
                                                
                         Process write
                      
                         Process flush including write

                         TX: write reply
                         TX: flush reply
     RX: write reply
     RX: flush reply


Currently the RX statements cannot be disordered. However the
server can process the requests in a different order. If it
does, the flush need not include the write, like this:

     Client              Server
     ======              ======

     TX: WRITE
     RX: FLUSH
                         RX: WRITE
                         RX: FLUSH
                                                
                         Process flush not including write

                         Process write                      

                         TX: flush reply
                         TX: write reply
    RX: flush reply
    RX: write reply

and the client gets to know of the fact, because the flush
reply comes before the write reply. It can know its data has
not been flushed. It could send another flush in this case, or
simply change its code to not send the flush until the write
has been received.

However, with the multi-connection support, both the replies
and the requests can be disordered. So the client can ONLY
know a flush has been completed if it has received a reply
to the write before it sends the flush.
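
Spelled out as code (the helpers here are hypothetical stand-ins for the client's existing framing logic, not anything in the patch), the only safe pattern on multiple connections becomes an explicit barrier:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helpers wrapping the client's existing request framing. */
int submit_write(int sock, uint64_t handle, const void *buf,
		 size_t len, uint64_t offset);
int submit_flush(int sock, uint64_t handle);
int wait_reply(int sock, uint64_t handle);

/* With one connection the client could stream write, write, flush and rely
 * on request ordering.  With several connections neither requests nor
 * replies keep their order across channels, so a write is only covered by
 * a flush if its reply was collected *before* the flush was transmitted. */
int write_with_barrier(int wr_sock, int fl_sock,
		       const void *buf, size_t len, uint64_t offset)
{
	if (submit_write(wr_sock, 1, buf, len, offset) ||
	    wait_reply(wr_sock, 1))		/* barrier: write completed */
		return -1;
	if (submit_flush(fl_sock, 2) ||
	    wait_reply(fl_sock, 2))
		return -1;
	return 0;
}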

This is in my opinion problematic, as what you want to do as
a client is stream requests (write, write, write, flush, write,
write, write). If those go down different channels, AND you
don't wait for a reply, you can no longer safely stream requests
at all. Now you need to wait for the flush request to respond
before sending another write (if write ordering to the platter
is important), which seems to defeat the object of streaming
commands.

An 'in extremis' example would be a sequence of write / flush
requests sent down two channels, where the write requests all
end up on one channel, and the flush requests on the other,
and the write channel is serviced immediately and the flush
requests delayed indefinitely.

> We don't define that, and therefore it could be any point
> between "receipt of the request message" and "sending the reply
> message". I had interpreted it closer to the latter than was apparently
> intended, but that isn't very useful;

The thing is the server doesn't know what replies the client has
received, only the replies it has sent. Equally the server doesn't
know what commands the client has sent, only what commands it has
received.

As currently written, it's a simple rule, NBD_CMD_FLUSH means
"Mr Server: you must make sure that any write you have sent a
reply to must now be persisted on disk. If you haven't yet sent
a reply to a write - perhaps because due to HOL blocking you
haven't received it, or perhaps it's still in progress, or perhaps
it's finished but you haven't sent the reply - don't worry".

The promise to the client is that all the writes to which the
server has sent a reply are now on disk. But the client doesn't
know what replies the server has sent a reply to. It only knows
which replies it has received (which will be a subset of those). So
to the client it means that the server has persisted to disk all
those commands to which it has received a reply. However, to the
server, the 'MUST' condition needs to refer to the replies it sent,
not the replies the client receives.

I think "before processing" here really just means "before sending
a reply to the NBD_CMD_FLUSH". I believe the 'before processing'
phrase came from the kernel wording.

> I see now that it should be closer
> to the former; a more useful definition is probably something along the
> following lines:
> 
>    All write commands (that includes NBD_CMD_WRITE and NBD_CMD_TRIM)
>    for which a reply was received on the client side prior to the

No, that's wrong as the server has no knowledge of whether the client
has actually received them so no way of knowing to which writes that
would reply. It thus has to ensure it covers a potentially larger
set of writes - those for which a reply has been sent, as all those
MIGHT have been received by the client.

>    transmission of the NBD_CMD_FLUSH message MUST be written to

no that's wrong because the server has no idea when the client
transmitted the NBD_CMD_FLUSH message. It can only deal with
events it knows about. And the delimiter is (in essence) those
write commands that were replied to prior to the sending of the
reply to the NBD_CMD_FLUSH - of course it can flush others too.

>    non-volatile storage prior to replying to that NBD_CMD_FLUSH. A
>    server MAY process this command in ways that result in committing
>    more data to non-volatile storage than is strictly required.

I think the wording is basically right for the current semantic,
but here's a slight improvement:

   The server MUST NOT send a non-error reply to NBD_CMD_FLUSH
   until it has ensured that the contents of all writes to which it
   has already completed (i.e. replied to) have been persisted
   to non-volatile storage.
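
As a server-side sketch of that single-connection rule (illustration only, not the reference server's code; the reply helpers are hypothetical): complete a write only once the data has actually been accepted, and do not reply to NBD_CMD_FLUSH until the sync has returned.

#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

/* Hypothetical reply helpers - stand-ins for the server's wire code. */
int reply_ok(int sock, const char handle[8]);
int reply_error(int sock, const char handle[8], int err);

static int handle_write(int sock, int fd, const char handle[8],
			const void *buf, size_t len, off_t offset)
{
	if (pwrite(fd, buf, len, offset) != (ssize_t)len)
		return reply_error(sock, handle, errno);
	/* Sending this reply is what "completes" the write. */
	return reply_ok(sock, handle);
}

static int handle_flush(int sock, int fd, const char handle[8])
{
	/* Persist everything before replying, so every write this server
	 * has already completed (replied to) is on stable storage by the
	 * time the client sees the flush reply. */
	if (fdatasync(fd) < 0)
		return reply_error(sock, handle, errno);
	return reply_ok(sock, handle);
}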

However, given that the replies can subsequently be reordered, I
now think we do have a problem, as the client can't tell which writes
those refer to.

> [...]
>> I don't think there is actually a problem here - Wouter if I'm wrong
>> about this, I'd like to understand your argument better.
> 
> No, I now see that there isn't, and I misunderstood things. However, I
> do think we should update the spec to clarify this.

Haha - I now think there is. You accidentally convinced me!

>> b) What I'm describing - which is the lack of synchronisation between
>> channels.
> [... long explanation snipped...]
> 
> Yes, and I acknowledge that. However, I think that should not be a
> blocker. It's fine to mark this feature as experimental; it will not
> ever be required to use multiple connections to connect to a server.
> 
> When this feature lands in nbd-client, I plan to ensure that the man
> page and -help output says something along the following lines:
> 
> use N connections to connect to the NBD server, improving performance
> at the cost of a possible loss of reliability.

So in essence we are relying on (userspace) nbd-client not to open
more connections if it's unsafe? IE we can sort out all the negotiation
of whether it's safe or unsafe within userspace and not bother Josef
about it? I suppose that's fine in that we can at least shorten
the CC: line, but I still think it would be helpful if the protocol addressed this explicitly.

>> Now, in the reference server, NBD_CMD_FLUSH is implemented through an
>> fdatasync().
> 
> Actually, no, the reference server uses fsync() for reasons that I've
> forgotten (side note: you wrote it that way ;-)
> 
> [...]

I vaguely remember why - something to do with files expanding when
holes were written to. However, I don't think that makes much
difference to the question I asked, or at most s/fdatasync/fsync/g

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
@ 2016-10-04  9:35                         ` Alex Bligh
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Bligh @ 2016-10-04  9:35 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, Christoph Hellwig, nbd-general, Jens Axboe,
	Josef Bacik, linux-kernel, linux-block, Kernel Team

Wouter,

>>> It is impossible for nbd to make such a guarantee, due to head-of-line
>>> blocking on TCP.
>> 
>> this is perfectly accurate as far as it goes, but this isn't the current
>> NBD definition of 'flush'.
> 
> I didn't read it that way.
> 
>> That is (from the docs):
>> 
>>> All write commands (that includes NBD_CMD_WRITE, and NBD_CMD_TRIM)
>>> that the server completes (i.e. replies to) prior to processing to a
>>> NBD_CMD_FLUSH MUST be written to non-volatile storage prior to
>>> replying to thatNBD_CMD_FLUSH.
> 
> This is somewhat ambiguous, in that (IMO) it doesn't clearly state the
> point where the cutoff of "may not be on disk yet" is. What is
> "processing"?

OK. I now get the problem. There are actually two types of HOL blocking,
server to client and client to server.

Previously, we allowed the server to issue replies out of order with requests.
However, the protocol did guarantee that the server saw requests in the
order presented by clients. With the proposed multi-connection support,
this changes. Whilst the client needed to be prepared for things to be
disordered by the server, the server did not previously need to be
prepared for things being disordered by the client. And (more subtly)
the client could assume that the server got its own requests in the
order it sent them, which is important for flush the way written at the
moment.


Here's an actual illustration of the problem:

Currently we have:

     Client              Server
     ======              ======

     TX: WRITE
     RX: FLUSH
                         RX: WRITE
                         RX: FLUSH
                                                
                         Process write
                      
                         Process flush including write

                         TX: write reply
                         TX: flush reply
     RX: write reply
     RX: flush reply


Currently the RX statements cannot be disordered. However the
server can process the requests in a different order. If it
does, the flush need not include the write, like this:

     Client              Server
     ======              ======

     TX: WRITE
     RX: FLUSH
                         RX: WRITE
                         RX: FLUSH
                                                
                         Process flush not including write

                         Process write                      

                         TX: flush reply
                         TX: write reply
    RX: flush reply
    RX: write reply

and the client gets to know of the fact, because the flush
reply comes before the write reply. It can know it's data has
not been flushed. It could send another flush in this case, or
simply change its code to not send the flush until the write
has been received.

However, with the multi-connection support, both the replies
and the requests can be disordered. So the client can ONLY
know a flush has been completed if it has received a reply
to the write before it sends the flush.

This is in my opinion problematic, as what you want to do as
a client is stream requests (write, write, write, flush, write,
write, write). If those go down different channels, AND you
don't wait for a reply, you can no longer safely stream requests
at all. Now you need to wait for the flush request to respond
before sending another write (if write ordering to the platter
is important), which seems to defeat the object of streaming
commands.

An 'in extremis' example would be a sequence of write / flush
requests sent down two channels, where the write requests all
end up on one channel, and the flush requests on the other,
and the write channel is serviced immediately and the flush
requests delayed indefinitely.

> We don't define that, and therefore it could be any point
> between "receipt of the request message" and "sending the reply
> message". I had interpreted it closer to the latter than was apparently
> intended, but that isn't very useful;

The thing is the server doesn't know what replies the client has
received, only the replies it has sent. Equally the server doesn't
know what commands the client has sent, only what commands it has
received.

As currently written, it's a simple rule, NBD_CMD_FLUSH means
"Mr Server: you must make sure that any write you have sent a
reply to must now be persisted on disk. If you haven't yet sent
a reply to a write - perhaps because due to HOL blocking you
haven't received it, or perhaps it's still in progress, or perhaps
it's finished but you haven't sent the reply - don't worry".

The promise to the the client is that all the writes to which the
server has sent a reply are now on disk. But the client doesn't
know what replies the server has sent a reply to. It only knows
which replies it has received (which will be a subset of those). So
to the client it means that the server has persisted to disk all
those commands to which it has received a reply. However, to the
server, the 'MUST' condition needs to refer to the replies it sent,
not the replies the client receives.

I think "before processing" here really just means "before sending
a reply to the NBD_CMD_FLUSH". I believe the 'before processing'
phrase came from the kernel wording.

> I see now that it should be closer
> to the former; a more useful definition is probably something along the
> following lines:
> 
>    All write commands (that includes NBD_CMD_WRITE and NBD_CMD_TRIM)
>    for which a reply was received on the client side prior to the

No, that's wrong as the server has no knowledge of whether the client
has actually received them so no way of knowing to which writes that
would reply. It thus has to ensure it covers a potentially larger
set of writes - those for which a reply has been sent, as all those
MIGHT have been received by the client.

>    transmission of the NBD_CMD_FLUSH message MUST be written to

no that's wrong because the server has no idea when the client
transmitted the NBD_CMD_FLUSH message. It can only deal with
events it knows about. And the delimiter is (in essence) those
write commands that were replied to prior to the sending of the
reply to the NBD_CMD_FLUSH - of course it can flush others too.

>    non-volatile storage prior to replying to that NBD_CMD_FLUSH. A
>    server MAY process this command in ways that result committing more
>    data to non-volatile storage than is strictly required.

I think the wording is basically right for the current semantic,
but here's a slight improvement:

   The server MUST NOT send a non-error reply to NBD_CMD_FLUSH
   until it has ensured that the contents of all writes to which it
   has already completed (i.e. replied to) have been persisted
   to non-volatile storage.

However, given that the replies can subsequently be reordered, I
now think we do have a problem, as the client can't tell to which
those refer.

> [...]
>> I don't think there is actually a problem here - Wouter if I'm wrong
>> about this, I'd like to understand your argument better.
> 
> No, I now see that there isn't, and I misunderstood things. However, I
> do think we should update the spec to clarify this.

Haha - I now think there is. You accidentally convinced me!

>> b) What I'm describing - which is the lack of synchronisation between
>> channels.
> [... long explanation snipped...]
> 
> Yes, and I acknowledge that. However, I think that should not be a
> blocker. It's fine to mark this feature as experimental; it will not
> ever be required to use multiple connections to connect to a server.
> 
> When this feature lands in nbd-client, I plan to ensure that the man
> page and -help output says something along the following lines:
> 
> use N connections to connect to the NBD server, improving performance
> at the cost of a possible loss of reliability.

So in essence we are relying on (userspace) nbd-client not to open
more connections if it's unsafe? IE we can sort out all the negotiation
of whether it's safe or unsafe within userspace and not bother Josef
about it? I suppose that's fine in that we can at least shorten
the CC: line, but I still think it would be helpful if the protocol

>> Now, in the reference server, NBD_CMD_FLUSH is implemented through an
>> fdatasync().
> 
> Actually, no, the reference server uses fsync() for reasons that I've
> forgotten (side note: you wrote it that way ;-)
> 
> [...]

I vaguely remember why - something to do with files expanding when
holes were written to. However, I don't think that makes much
difference to the question I asked, or at most s/fdatasync/fsync/g

-- 
Alex Bligh

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-04  9:35                         ` Alex Bligh
@ 2016-10-06  9:04                           ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-06  9:04 UTC (permalink / raw)
  To: Alex Bligh
  Cc: nbd-general, Jens Axboe, Josef Bacik, linux-kernel,
	Christoph Hellwig, linux-block, Kernel Team

Hi Alex,

On Tue, Oct 04, 2016 at 10:35:03AM +0100, Alex Bligh wrote:
> Wouter,
> > I see now that it should be closer
> > to the former; a more useful definition is probably something along the
> > following lines:
> > 
> >    All write commands (that includes NBD_CMD_WRITE and NBD_CMD_TRIM)
> >    for which a reply was received on the client side prior to the
> 
> No, that's wrong as the server has no knowledge of whether the client
> has actually received them so no way of knowing to which writes that
> would reply.

I realise that, but I don't think it's a problem.

In the current situation, a client could opportunistically send a number
of write requests immediately followed by a flush and hope for the best.
However, in that case there is no guarantee that for the write requests
that the client actually cares about to have hit the disk, a reply
arrives on the client side before the flush reply arrives. If that
doesn't happen, that would then mean the client would have to issue
another flush request, probably at a performance hit.

As I understand Christoph's explanations, currently the Linux kernel
*doesn't* issue flush requests unless and until the necessary writes
have already completed (i.e., the reply has been received and processed
on the client side). Given that, given the issue in the previous
paragraph, and given the uncertainty introduced with multiple
connections, I think it is reasonable to say that a client should just
not assume a flush touches anything except for the writes for which it
has already received a reply by the time the flush request is sent out.

Those are semantics that are actually useful and can be guaranteed in
the face of multiple connections. Other semantics can not.
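
As a rough sketch, that client-side rule amounts to something like the
following (structure and helper names are made up for illustration, not
taken from nbd-client or the kernel driver):

#include <stdbool.h>
#include <stddef.h>

struct pending_write {
	bool replied;       /* write reply already received           */
	bool must_persist;  /* caller needs this covered by the flush  */
};

/*
 * Under these semantics a flush only guarantees durability for writes
 * whose reply arrived before the flush request was sent, so a client
 * that cares about particular writes must hold the flush back until
 * those replies are in.
 */
static bool safe_to_send_flush(const struct pending_write *w, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (w[i].must_persist && !w[i].replied)
			return false;
	return true;
}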

It is indeed impossible for a server to know what has been received by
the client by the time it (the client) sent out the flush request.
However, the server doesn't need that information, at all. The flush
request's semantics do not say that any request not covered by the flush
request itself MUST NOT have hit disk; instead, it just says that there
is no guarantee on whether or not that is the case. That's fine; all a
server needs to know is that when it receives a flush, it needs to
fsync() or some such, and then send the reply. All a *client* needs to
know is which requests have most definitely hit the disk. In my
proposal, those are the requests that finished before the flush request
was sent, and not the requests that finished between that and when the
flush reply is received. Those are *likely* to also be covered
(especially on single-connection NBD setups), but in my proposal,
they're no longer *guaranteed* to be.
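
On the server side this boils down to something like the sketch below;
handle_flush() and send_reply() are invented names for illustration, not
the reference server's internals:

#include <errno.h>
#include <stdint.h>
#include <unistd.h>

int send_reply(uint64_t handle, int error);   /* hypothetical helper */

/*
 * Everything the server has already completed (replied to) must be on
 * stable storage before the flush reply goes out; an fsync() of the
 * export achieves that, possibly flushing more than strictly required.
 */
static int handle_flush(int export_fd, uint64_t handle)
{
	if (fsync(export_fd) < 0)
		return send_reply(handle, errno);   /* error reply   */
	return send_reply(handle, 0);               /* success reply */
}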

Christoph: just to double-check: would such semantics be incompatible
with the semantics that the Linux kernel expects of block devices? If
so, we'll have to review. Otherwise, I think we should go with that.

[...]
> >> b) What I'm describing - which is the lack of synchronisation between
> >> channels.
> > [... long explanation snipped...]
> > 
> > Yes, and I acknowledge that. However, I think that should not be a
> > blocker. It's fine to mark this feature as experimental; it will not
> > ever be required to use multiple connections to connect to a server.
> > 
> > When this feature lands in nbd-client, I plan to ensure that the man
> > page and -help output says something along the following lines:
> > 
> > use N connections to connect to the NBD server, improving performance
> > at the cost of a possible loss of reliability.
> 
> So in essence we are relying on (userspace) nbd-client not to open
> more connections if it's unsafe? IE we can sort out all the negotiation
> of whether it's safe or unsafe within userspace and not bother Josef
> about it?

Yes, exactly.

> I suppose that's fine in that we can at least shorten the CC: line,
> but I still think it would be helpful if the protocol

unfinished sentence here...

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06  9:04                           ` Wouter Verhelst
@ 2016-10-06  9:41                             ` Alex Bligh
  -1 siblings, 0 replies; 44+ messages in thread
From: Alex Bligh @ 2016-10-06  9:41 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, nbd-general, Jens Axboe, Josef Bacik, linux-kernel,
	Christoph Hellwig, linux-block, Kernel Team

Wouter,

> On 6 Oct 2016, at 10:04, Wouter Verhelst <w@uter.be> wrote:
> 
> Hi Alex,
> 
> On Tue, Oct 04, 2016 at 10:35:03AM +0100, Alex Bligh wrote:
>> Wouter,
>>> I see now that it should be closer
>>> to the former; a more useful definition is probably something along the
>>> following lines:
>>> 
>>>   All write commands (that includes NBD_CMD_WRITE and NBD_CMD_TRIM)
>>>   for which a reply was received on the client side prior to the
>> 
>> No, that's wrong as the server has no knowledge of whether the client
>> has actually received them so no way of knowing to which writes that
>> would reply.
> 
> I realise that, but I don't think it's a problem.
> 
> In the current situation, a client could opportunistically send a number
> of write requests immediately followed by a flush and hope for the best.
> However, in that case there is no guarantee that for the write requests
> that the client actually cares about to have hit the disk, a reply
> arrives on the client side before the flush reply arrives. If that
> doesn't happen, that would then mean the client would have to issue
> another flush request, probably at a performance hit.

Sure, but the client knows (currently) that any write request which
it has a reply to before it receives the reply from the flush request
has been written to disk. Such a client might simply note whether it
has issued any subsequent write requests.

> As I understand Christoph's explanations, currently the Linux kernel
> *doesn't* issue flush requests unless and until the necessary writes
> have already completed (i.e., the reply has been received and processed
> on the client side).

Sure, but it is not the only client.

> Given that, given the issue in the previous
> paragraph, and given the uncertainty introduced with multiple
> connections, I think it is reasonable to say that a client should just
> not assume a flush touches anything except for the writes for which it
> has already received a reply by the time the flush request is sent out.

OK. So you are proposing weakening the semantic for flush (saying that
it is only guaranteed to cover those writes for which the client has
actually received a reply prior to sending the flush, as opposed to
prior to receiving the flush reply). This is based on the view that
the Linux kernel client wouldn't be affected, and if other clients
were affected, their behaviour would be 'somewhat unusual'.

We do have one significant other client out there that uses flush
which is Qemu. I think we should get a view on whether they would be
affected.

> Those are semantics that are actually useful and can be guaranteed in
> the face of multiple connections. Other semantics can not.

Well there is another semantic which would work just fine, and also
cures the other problem (synchronisation between channels) which would
be simply that flush is only guaranteed to affect writes issued on the
same channel. Then flush would do the natural thing, i.e. flush
all the writes that had been done *on that channel*.

> It is indeed impossible for a server to know what has been received by
> the client by the time it (the client) sent out the flush request.
> However, the server doesn't need that information, at all. The flush
> request's semantics do not say that any request not covered by the flush
> request itself MUST NOT have hit disk; instead, it just says that there
> is no guarantee on whether or not that is the case. That's fine; all a
> server needs to know is that when it receives a flush, it needs to
> fsync() or some such, and then send the reply. All a *client* needs to
> know is which requests have most definitely hit the disk. In my
> proposal, those are the requests that finished before the flush request
> was sent, and not the requests that finished between that and when the
> flush reply is received. Those are *likely* to also be covered
> (especially on single-connection NBD setups), but in my proposal,
> they're no longer *guaranteed* to be.

I think my objection was more that you were writing mandatory language
for a server's behaviour based on what the client perceives.

What you are saying from the client's point of view is that, under your
proposed change, it can only rely on the persistence of those writes for
which the reply has been received prior to issuing the flush (more might
be persisted, but the client can't rely on it).

So far so good.

However, I don't think you can usefully make the guarantee weaker from the
SERVER'S point of view, because it doesn't know how things got reordered.
IE it still needs to persist to disk any write that it has completed
when it processes the flush. Yes, the client doesn't get the same guarantee,
but the server can't know whether it can be slacker about a particular write
which it has performed but for which the client didn't receive the reply
prior to issuing the flush - it must just assume that if it sent the reply
(or even queued it to be sent) before processing the flush, then that reply
MIGHT have arrived at the client prior to the flush being issued.

IE I don't actually think the wording from the server side needs changing
now that I see what you are trying to do. We just need a new paragraph
saying what the client can and cannot rely on.

> Christoph: just to double-check: would such semantics be incompatible
> with the semantics that the Linux kernel expects of block devices? If
> so, we'll have to review. Otherwise, I think we should go with that.

It would also really be nice to know whether there is any way the
flushes could be linked to the channel(s) containing the writes to which
they belong - this would solve the issues with coherency between channels.

Equally no one has answered the question as to whether fsync/fdatasync
is guaranteed (especially when not on Linux, not on a block FS) to give
synchronisation when different processes have different FDs open on the
same file. Is there some way to detect when this is safe?
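
To be concrete, the situation being asked about is roughly this
(illustrative only, with a placeholder file name; whether the fsync() on
the second descriptor covers the write done through the first, on every
OS and filesystem an NBD server may run on, is exactly the open question):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[512] = { 0 };
	int fd_a = open("backing-file", O_RDWR);   /* serves channel 1 */
	int fd_b = open("backing-file", O_RDWR);   /* serves channel 2 */

	if (fd_a < 0 || fd_b < 0)
		return 1;

	pwrite(fd_a, buf, sizeof(buf), 0);         /* write via channel 1 */
	fsync(fd_b);                               /* flush via channel 2 */

	close(fd_a);
	close(fd_b);
	return 0;
}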

> 
> [...]
>>>> b) What I'm describing - which is the lack of synchronisation between
>>>> channels.
>>> [... long explanation snipped...]
>>> 
>>> Yes, and I acknowledge that. However, I think that should not be a
>>> blocker. It's fine to mark this feature as experimental; it will not
>>> ever be required to use multiple connections to connect to a server.
>>> 
>>> When this feature lands in nbd-client, I plan to ensure that the man
>>> page and -help output says something along the following lines:
>>> 
>>> use N connections to connect to the NBD server, improving performance
>>> at the cost of a possible loss of reliability.
>> 
>> So in essence we are relying on (userspace) nbd-client not to open
>> more connections if it's unsafe? IE we can sort out all the negotiation
>> of whether it's safe or unsafe within userspace and not bother Josef
>> about it?
> 
> Yes, exactly.
> 
>> I suppose that's fine in that we can at least shorten the CC: line,
>> but I still think it would be helpful if the protocol
> 
> unfinished sentence here...

... but I still think it would be helpful if the protocol helped out
the end user of the client and refused to negotiate multichannel
connections when they are unsafe. How is the end client meant to know
whether the back end is not on Linux, not on a block device, done
via a Ceph driver etc?

I still think it's pretty damn awkward that with a ceph back end
(for instance) which would be one of the backends to benefit the
most from multichannel connections (as it's inherently parallel),
no one has explained how flush could be done safely.

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06  9:41                             ` Alex Bligh
@ 2016-10-06 10:15                               ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-06 10:15 UTC (permalink / raw)
  To: Alex Bligh
  Cc: nbd-general, Christoph Hellwig, Josef Bacik, linux-kernel,
	Jens Axboe, linux-block, Kernel Team

On Thu, Oct 06, 2016 at 10:41:36AM +0100, Alex Bligh wrote:
> Wouter,
[...]
> > Given that, given the issue in the previous
> > paragraph, and given the uncertainty introduced with multiple
> > connections, I think it is reasonable to say that a client should just
> > not assume a flush touches anything except for the writes for which it
> > has already received a reply by the time the flush request is sent out.
> 
> OK. So you are proposing weakening the semantic for flush (saying that
> it is only guaranteed to cover those writes for which the client has
> actually received a reply prior to sending the flush, as opposed to
> prior to receiving the flush reply). This is based on the view that
> the Linux kernel client wouldn't be affected, and if other clients
> were affected, their behaviour would be 'somewhat unusual'.

Right.

> We do have one significant other client out there that uses flush
> which is Qemu. I think we should get a view on whether they would be
> affected.

That's certainly something to consider, yes.

> > Those are semantics that are actually useful and can be guaranteed in
> > the face of multiple connections. Other semantics can not.
> 
> Well there is another semantic which would work just fine, and also
> cures the other problem (synchronisation between channels) which would
> be simply that flush is only guaranteed to affect writes issued on the
> same channel. Then flush would do the natural thing, i.e. flush
> all the writes that had been done *on that channel*.

That is an option, yes, but the natural result will be that you issue N
flush requests, rather than one, which I'm guessing will kill
performance. Therefore, I'd prefer not to go down that route.
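
That is, under per-channel semantics a client needing a device-wide
barrier would end up doing something like this sketch (hypothetical
helper names), paying one round trip per connection:

struct nbd_channel;                                 /* hypothetical */
int send_flush_and_wait(struct nbd_channel *chan);  /* hypothetical */

/* One NBD_CMD_FLUSH per connection, each a full round trip. */
static int flush_all_channels(struct nbd_channel **chans, int nr_channels)
{
	int i, ret;

	for (i = 0; i < nr_channels; i++) {
		ret = send_flush_and_wait(chans[i]);
		if (ret)
			return ret;
	}
	return 0;
}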

[...]
> > It is indeed impossible for a server to know what has been received by
> > the client by the time it (the client) sent out the flush request.
> > However, the server doesn't need that information, at all. The flush
> > request's semantics do not say that any request not covered by the flush
> > request itself MUST NOT have hit disk; instead, it just says that there
> > is no guarantee on whether or not that is the case. That's fine; all a
> > server needs to know is that when it receives a flush, it needs to
> > fsync() or some such, and then send the reply. All a *client* needs to
> > know is which requests have most definitely hit the disk. In my
> > proposal, those are the requests that finished before the flush request
> > was sent, and not the requests that finished between that and when the
> > flush reply is received. Those are *likely* to also be covered
> > (especially on single-connection NBD setups), but in my proposal,
> > they're no longer *guaranteed* to be.
> 
> I think my objection was more that you were writing mandatory language
> for a server's behaviour based on what the client perceives.
> 
> What you are saying from the client's point of view is that, under your
> proposed change, it can only rely on the persistence of those writes for
> which the reply has been received prior to issuing the flush (more might
> be persisted, but the client can't rely on it).

Exactly.

[...]
> IE I don't actually think the wording from the server side needs changing
> now that I see what you are trying to do. We just need a new paragraph
> saying what the client can and cannot rely on.

That's obviously also a valid option. I'm looking forward to your
proposed wording then :-)

[...]
> >> I suppose that's fine in that we can at least shorten the CC: line,
> >> but I still think it would be helpful if the protocol
> > 
> > unfinished sentence here...
> 
> .... but I still think it would be helpful if the protocol helped out
> the end user of the client and refused to negotiate multichannel
> connections when they are unsafe. How is the end client meant to know
> whether the back end is not on Linux, not on a block device, done
> via a Ceph driver etc?

Well, it isn't. The server, if it provides certain functionality, should
also provide particular guarantees. If it can't provide those
guarantees, it should not provide that functionality.

e.g., if a server runs on a backend with cache coherency issues, it
should not allow multiple connections to the same device, etc.

> I still think it's pretty damn awkward that with a ceph back end
> (for instance) which would be one of the backends to benefit the
> most from multichannel connections (as it's inherently parallel),
> no one has explained how flush could be done safely.

If ceph doesn't have any way to guarantee that a write is available to
all readers of a particular device, then it *cannot* be used to map
block device semantics with multiple channels. Therefore, it should not
allow writing to the device from multiple clients, period, unless the
filesystem (or other thing) making use of the nbd device above the ceph
layer actually understands how things may go wrong and can take care of
it.

As such, I don't think that the problems inherent in using multiple
connections to a ceph device (which I do not deny) have any place in a
discussion on how NBD should work in the face of multiple channels with
a sane/regular backend.

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06  9:04                           ` Wouter Verhelst
@ 2016-10-06 10:31                             ` Christoph Hellwig
  -1 siblings, 0 replies; 44+ messages in thread
From: Christoph Hellwig @ 2016-10-06 10:31 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, nbd-general, Jens Axboe, Josef Bacik, linux-kernel,
	Christoph Hellwig, linux-block, Kernel Team

On Thu, Oct 06, 2016 at 11:04:15AM +0200, Wouter Verhelst wrote:
> In the current situation, a client could opportunistically send a number
> of write requests immediately followed by a flush and hope for the best.
> However, in that case there is no guarantee that for the write requests
> that the client actually cares about to have hit the disk, a reply
> arrives on the client side before the flush reply arrives. If that
> doesn't happen, that would then mean the client would have to issue
> another flush request, probably at a performance hit.

There is also no guarantee that the server would receive them in order.

Note that people looked into schemes like this multiple times using
a SCSI feature called ordered tags which should provide this sort
of ordering, but no one managed to make it work reliably.

> As I understand Christoph's explanations, currently the Linux kernel
> *doesn't* issue flush requests unless and until the necessary writes
> have already completed (i.e., the reply has been received and processed
> on the client side). Given that, given the issue in the previous
> paragraph, and given the uncertainty introduced with multiple
> connections, I think it is reasonable to say that a client should just
> not assume a flush touches anything except for the writes for which it
> has already received a reply by the time the flush request is sent out.

Exactly.  That's the wording in other protocol specifications, and the
semantics Linux (and Windows) rely on.

> Christoph: just to double-check: would such semantics be incompatible
> with the semantics that the Linux kernel expects of block devices? If
> so, we'll have to review. Otherwise, I think we should go with that.

No, they match the cache flush semantics in every other storage protocol
known to me, and they match the expectations of both the Linux kernel
and any other OS or consumer I know about perfectly.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06 10:15                               ` Wouter Verhelst
@ 2016-10-06 11:04                                 ` Alex Bligh
  -1 siblings, 0 replies; 44+ messages in thread
From: Alex Bligh @ 2016-10-06 11:04 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Alex Bligh, nbd-general, Christoph Hellwig, Josef Bacik,
	linux-kernel, Jens Axboe, linux-block, Kernel Team


> On 6 Oct 2016, at 11:15, Wouter Verhelst <w@uter.be> wrote:
> 
>> 
>> 
>> .... but I still think it would be helpful if the protocol helped out
>> the end user of the client and refused to negotiate multichannel
>> connections when they are unsafe. How is the end client meant to know
>> whether the back end is not on Linux, not on a block device, done
>> via a Ceph driver etc?
> 
> Well, it isn't. The server, if it provides certain functionality, should
> also provide particular guarantees. If it can't provide those
> guarantees, it should not provide that functionality.
> 
> e.g., if a server runs on a backend with cache coherency issues, it
> should not allow multiple connections to the same device, etc.

Sure. I'm simply saying that the connection flags should say "I can't
support multiple connections to this device" (available at
NBD_OPT_INFO time) rather than erroring out. This is a userspace
protocol issue.
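
Something along these lines, where the flag name and bit value are purely
illustrative (no such flag exists in the spec today):

#include <stdint.h>

#define NBD_FLAG_CAN_MULTI_CONN  (1 << 8)   /* illustrative bit only */

/*
 * Client side: fall back to a single connection unless the export's
 * transmission flags (seen at NBD_OPT_INFO time) advertise that
 * multiple connections are safe.
 */
static int connections_to_open(uint16_t transmission_flags, int requested)
{
	if (!(transmission_flags & NBD_FLAG_CAN_MULTI_CONN))
		return 1;
	return requested;
}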

>> I still think it's pretty damn awkward that with a ceph back end
>> (for instance) which would be one of the backends to benefit the
>> most from multichannel connections (as it's inherently parallel),
>> no one has explained how flush could be done safely.
> 
> If ceph doesn't have any way to guarantee that a write is available to
> all readers of a particular device, then it *cannot* be used to map
> block device semantics with multiple channels.

Thinking about it I believe Ceph actually may be able to do that,
it's just harder than a straightforward flush.

> Therefore, it should not
> allow writing to the device from multiple clients, period, unless the
> filesystem (or other thing) making use of the nbd device above the ceph
> layer actually understands how things may go wrong and can take care of
> it.
> 
> As such, I don't think that the problems inherent in using multiple
> connections to a ceph device (which I do not deny) have any place in a
> discussion on how NBD should work in the face of multiple channels with
> a sane/regular backend.

On which note, I am still not convinced that fsync() provides such
semantics on all operating systems and on Linux on non-block devices.
I'm not sure all those backends are 'insane'! However, if the server
could signal lack of support for multiple connections (see above)
my concerns would be significantly reduced. Note this requires no
kernel change (as you pointed out).

-- 
Alex Bligh





^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06 10:31                             ` Christoph Hellwig
@ 2016-10-06 13:09                               ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-06 13:09 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: nbd-general, Josef Bacik, linux-kernel, Jens Axboe, linux-block,
	Kernel Team

On Thu, Oct 06, 2016 at 03:31:55AM -0700, Christoph Hellwig wrote:
> No, they match the cache flush semantics in every other storage protocol
> known to me, and they match the expectations of both the Linux kernel
> and any other OS or comsumer I know about perfectly.

Okay, I've updated the proto.md file then, to clarify that in the case
of multiple connections, a client MUST NOT send a flush request until it
has seen the replies to the write requests that it cares about. That
should be enough for now.

Thanks,

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06 13:09                               ` Wouter Verhelst
@ 2016-10-06 13:16                                 ` Christoph Hellwig
  -1 siblings, 0 replies; 44+ messages in thread
From: Christoph Hellwig @ 2016-10-06 13:16 UTC (permalink / raw)
  To: Wouter Verhelst
  Cc: Christoph Hellwig, nbd-general, Josef Bacik, linux-kernel,
	Jens Axboe, linux-block, Kernel Team

On Thu, Oct 06, 2016 at 03:09:49PM +0200, Wouter Verhelst wrote:
> Okay, I've updated the proto.md file then, to clarify that in the case
> of multiple connections, a client MUST NOT send a flush request until it
> has seen the replies to the write requests that it cares about. That
> should be enough for now.

How do you guarantee that nothing has been reordered or even lost even for
a single connection?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Nbd] [PATCH][V3] nbd: add multi-connection support
  2016-10-06 13:16                                 ` Christoph Hellwig
@ 2016-10-06 13:55                                   ` Wouter Verhelst
  -1 siblings, 0 replies; 44+ messages in thread
From: Wouter Verhelst @ 2016-10-06 13:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: nbd-general, linux-block, Jens Axboe, linux-kernel, Josef Bacik,
	Kernel Team

On Thu, Oct 06, 2016 at 06:16:30AM -0700, Christoph Hellwig wrote:
> On Thu, Oct 06, 2016 at 03:09:49PM +0200, Wouter Verhelst wrote:
> > Okay, I've updated the proto.md file then, to clarify that in the case
> > of multiple connections, a client MUST NOT send a flush request until it
> > has seen the replies to the write requests that it cares about. That
> > should be enough for now.
> 
> How do you guarantee that nothing has been reordered or even lost even for
> a single connection?

In the case of a single connection, we already stated that the flush
covers the write requests for which a reply has already been sent out by
the time the flush reply is sent out. On a single connection, there is
no way an implementation can comply with the old requirement but not the
new one.

We do not guarantee any ordering beyond that; and lost requests would be
a bug in the server.
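
(Stated as code, the obligation on the server side is just that every
write already replied to must be durable before the flush reply goes
out. A rough sketch, with send_simple_reply() an assumed helper for
writing the NBD reply header, not the real nbd-server code:

#include <errno.h>
#include <unistd.h>

/* Assumed helper: send an NBD reply with the given error value back to
 * the client on 'sock' for request 'cookie'. */
int send_simple_reply(int sock, unsigned long long cookie, int error);

/* Handle NBD_CMD_FLUSH for an export backed by file descriptor 'fd'. */
int handle_flush(int sock, int fd, unsigned long long cookie)
{
	/* Make every write we have already acknowledged durable ... */
	if (fdatasync(fd) < 0)
		return send_simple_reply(sock, cookie, EIO);

	/* ... and only then let the flush reply out. */
	return send_simple_reply(sock, cookie, 0);
}

A server that reorders or parallelises internally just has to make
sure the sync covers whatever it has already acknowledged by the time
it answers the flush.)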

-- 
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
       people in the world who think they really understand all of its rules,
       and pretty much all of them are just lying to themselves too.
 -- #debian-devel, OFTC, 2016-02-12

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH][V3] nbd: add multi-connection support
  2016-09-28 20:01 [PATCH][V3] nbd: add multi-connection support Josef Bacik
  2016-09-29  9:52 ` [Nbd] " Wouter Verhelst
@ 2016-10-11  9:00 ` Sagi Grimberg
  1 sibling, 0 replies; 44+ messages in thread
From: Sagi Grimberg @ 2016-10-11  9:00 UTC (permalink / raw)
  To: Josef Bacik, axboe, linux-block, linux-kernel, kernel-team,
	nbd-general, w


> NBD can become contended on its single connection.  We have to serialize all
> writes and we can only process one read response at a time.  Fix this by
> allowing userspace to provide multiple connections to a single nbd device.  This
> coupled with block-mq drastically increases performance in multi-process cases.
> Thanks,

Hey Josef,

I gave this patch a tryout and I'm hitting an "unable to handle kernel
paging request" oops when running a multi-threaded write workload [1].

I have 2 VMs on my laptop, each assigned 2 CPUs. I connected the
client to the server via 2 connections and ran:
fio --group_reporting --rw=randwrite --bs=4k --numjobs=2 --iodepth=128 
--runtime=60 --time_based --loops=1 --ioengine=libaio --direct=1 
--invalidate=1 --randrepeat=1 --norandommap --exitall --name task_nbd0 
--filename=/dev/nbd0

The server backend is null_blk btw:
./nbd-server 1022 /dev/nullb0

nbd-client:
./nbd-client -C 2 192.168.100.3 1022 /dev/nbd0

[1]:
[  171.813649] BUG: unable to handle kernel paging request at 
0000000235363130
[  171.816015] IP: [<ffffffffc0645e39>] nbd_queue_rq+0x319/0x580 [nbd]
[  171.816015] PGD 7a080067 PUD 0
[  171.816015] Oops: 0000 [#1] SMP
[  171.816015] Modules linked in: nbd(O) rpcsec_gss_krb5 nfsv4 ib_iser 
iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi 
snd_hda_codec_generic ppdev kvm_intel cirrus snd_hda_intel ttm kvm 
irqbypass drm_kms_helper snd_hda_codec drm snd_hda_core snd_hwdep joydev 
input_leds fb_sys_fops snd_pcm serio_raw syscopyarea snd_timer 
sysfillrect snd sysimgblt soundcore i2c_piix4 nfsd ib_umad parport_pc 
auth_rpcgss nfs_acl rdma_ucm nfs rdma_cm iw_cm lockd grace ib_cm 
configfs sunrpc ib_uverbs mac_hid fscache ib_core lp parport psmouse 
floppy e1000 pata_acpi [last unloaded: nbd]
[  171.816015] CPU: 0 PID: 196 Comm: kworker/0:1H Tainted: G           O 
    4.8.0-rc4+ #61
[  171.816015] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS Bochs 01/01/2011
[  171.816015] Workqueue: kblockd blk_mq_run_work_fn
[  171.816015] task: ffff8f0b37b23280 task.stack: ffff8f0b37bf0000
[  171.816015] RIP: 0010:[<ffffffffc0645e39>]  [<ffffffffc0645e39>] 
nbd_queue_rq+0x319/0x580 [nbd]
[  171.816015] RSP: 0018:ffff8f0b37bf3c20  EFLAGS: 00010206
[  171.816015] RAX: 0000000235363130 RBX: 0000000000000000 RCX: 
0000000000000200
[  171.816015] RDX: 0000000000000200 RSI: ffff8f0b37b23b48 RDI: 
ffff8f0b37b23280
[  171.816015] RBP: ffff8f0b37bf3cc8 R08: 0000000000000001 R09: 
0000000000000000
[  171.816015] R10: 0000000000000000 R11: ffff8f0b37f21000 R12: 
0000000023536303
[  171.816015] R13: 0000000000000000 R14: 0000000023536313 R15: 
ffff8f0b37f21000
[  171.816015] FS:  0000000000000000(0000) GS:ffff8f0b3d200000(0000) 
knlGS:0000000000000000
[  171.816015] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  171.816015] CR2: 0000000235363130 CR3: 00000000789b7000 CR4: 
00000000000006f0
[  171.816015] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  171.816015] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  171.816015] Stack:
[  171.816015]  ffff8f0b00000000 ffff8f0b37a79480 ffff8f0b378513c8 
0000000000000282
[  171.816015]  ffff8f0b37b28428 ffff8f0b37a795f0 ffff8f0b37f21500 
00000a0023536313
[  171.816015]  ffffea0001c69080 0000000000000000 ffff8f0b37b28280 
1395602537b23280
[  171.816015] Call Trace:
[  171.816015]  [<ffffffffb8426840>] __blk_mq_run_hw_queue+0x260/0x390
[  171.816015]  [<ffffffffb84269b2>] blk_mq_run_work_fn+0x12/0x20
[  171.816015]  [<ffffffffb80aae21>] process_one_work+0x1f1/0x6b0
[  171.816015]  [<ffffffffb80aada2>] ? process_one_work+0x172/0x6b0
[  171.816015]  [<ffffffffb80ab32e>] worker_thread+0x4e/0x490
[  171.816015]  [<ffffffffb80ab2e0>] ? process_one_work+0x6b0/0x6b0
[  171.816015]  [<ffffffffb80ab2e0>] ? process_one_work+0x6b0/0x6b0
[  171.816015]  [<ffffffffb80b1f41>] kthread+0x101/0x120
[  171.816015]  [<ffffffffb88d4ecf>] ret_from_fork+0x1f/0x40
[  171.816015]  [<ffffffffb80b1e40>] ? kthread_create_on_node+0x250/0x250

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2016-10-11  9:00 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-28 20:01 [PATCH][V3] nbd: add multi-connection support Josef Bacik
2016-09-29  9:52 ` [Nbd] " Wouter Verhelst
2016-09-29 14:03   ` Josef Bacik
2016-09-29 16:41     ` Wouter Verhelst
2016-09-29 16:59       ` Josef Bacik
2016-10-02 16:17         ` Alex Bligh
2016-10-02 16:17           ` Alex Bligh
2016-10-03  1:47           ` Josef Bacik
2016-10-03  1:47             ` Josef Bacik
2016-10-03  7:20             ` Christoph Hellwig
2016-10-03  7:20               ` Christoph Hellwig
2016-10-03  7:51               ` Wouter Verhelst
2016-10-03  7:51                 ` Wouter Verhelst
2016-10-03  7:57                 ` Christoph Hellwig
2016-10-03  7:57                   ` Christoph Hellwig
2016-10-03 11:34                   ` Alex Bligh
2016-10-03 11:34                     ` Alex Bligh
2016-10-03 14:32                     ` Josef Bacik
2016-10-03 14:32                       ` Josef Bacik
2016-10-03 14:46                       ` Alex Bligh
2016-10-03 14:46                         ` Alex Bligh
2016-10-03 21:07                     ` Wouter Verhelst
2016-10-03 21:07                       ` Wouter Verhelst
2016-10-04  9:35                       ` Alex Bligh
2016-10-04  9:35                         ` Alex Bligh
2016-10-06  9:04                         ` Wouter Verhelst
2016-10-06  9:04                           ` Wouter Verhelst
2016-10-06  9:41                           ` Alex Bligh
2016-10-06  9:41                             ` Alex Bligh
2016-10-06 10:15                             ` Wouter Verhelst
2016-10-06 10:15                               ` Wouter Verhelst
2016-10-06 11:04                               ` Alex Bligh
2016-10-06 11:04                                 ` Alex Bligh
2016-10-06 10:31                           ` Christoph Hellwig
2016-10-06 10:31                             ` Christoph Hellwig
2016-10-06 13:09                             ` Wouter Verhelst
2016-10-06 13:09                               ` Wouter Verhelst
2016-10-06 13:16                               ` Christoph Hellwig
2016-10-06 13:16                                 ` Christoph Hellwig
2016-10-06 13:55                                 ` Wouter Verhelst
2016-10-06 13:55                                   ` Wouter Verhelst
2016-10-03  7:49           ` Wouter Verhelst
2016-10-03  7:49             ` Wouter Verhelst
2016-10-11  9:00 ` Sagi Grimberg
