All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
@ 2018-12-12  9:31 jiangyiwen
  2018-12-12 15:31 ` Michael S. Tsirkin
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-12  9:31 UTC (permalink / raw)
  To: Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang
  Cc: netdev, kvm, virtualization

When the guest receives a mergeable rx buffer, it can merge
scattered rx buffers into one big buffer and then copy it
to user space.

In addition, it also uses an iovec to replace buf in struct
virtio_vsock_pkt, keeping tx and rx consistent. The only
difference is that tx is still implemented with a single
segment of contiguous physical memory.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
---
 drivers/vhost/vsock.c                   |  31 +++++++---
 include/linux/virtio_vsock.h            |   6 +-
 net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
 net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
 4 files changed, 166 insertions(+), 35 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index dc52b0f..c7ab0dd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		size_t nbytes;
 		size_t len;
 		s16 headcount;
+		size_t remain_len;
+		int i;

 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 			break;
 		}

-		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
-		if (nbytes != pkt->len) {
-			virtio_transport_free_pkt(pkt);
-			vq_err(vq, "Faulted on copying pkt buf\n");
-			break;
+		remain_len = pkt->len;
+		for (i = 0; i < pkt->nr_vecs; i++) {
+			int tmp_len;
+
+			tmp_len = min(remain_len, pkt->vec[i].iov_len);
+			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
+			if (nbytes != tmp_len) {
+				virtio_transport_free_pkt(pkt);
+				vq_err(vq, "Faulted on copying pkt buf\n");
+				break;
+			}
+
+			remain_len -= tmp_len;
 		}

 		vhost_add_used_n(vq, vq->heads, headcount);
@@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
 	struct iov_iter iov_iter;
 	size_t nbytes;
 	size_t len;
+	void *buf;

 	if (in != 0) {
 		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
@@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
 		return NULL;
 	}

-	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
-	if (!pkt->buf) {
+	buf = kmalloc(pkt->len, GFP_KERNEL);
+	if (!buf) {
 		kfree(pkt);
 		return NULL;
 	}

-	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
+	pkt->vec[0].iov_base = buf;
+	pkt->vec[0].iov_len = pkt->len;
+	pkt->nr_vecs = 1;
+
+	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
 	if (nbytes != pkt->len) {
 		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
 		       pkt->len, nbytes);
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index da9e1fe..734eeed 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -13,6 +13,8 @@
 #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
 #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
 #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
+/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
+#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)

 /* Virtio-vsock feature */
 #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
@@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
 	struct list_head list;
 	/* socket refcnt not held, only use for cancellation */
 	struct vsock_sock *vsk;
-	void *buf;
+	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
+	int nr_vecs;
 	u32 len;
 	u32 off;
 	bool reply;
+	bool mergeable;
 };

 struct virtio_vsock_pkt_info {
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index c4a465c..148b58a 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,

 		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
 		sgs[out_sg++] = &hdr;
-		if (pkt->buf) {
-			sg_init_one(&buf, pkt->buf, pkt->len);
+		if (pkt->len) {
+			/* Currently only support a segment of memory in tx */
+			BUG_ON(pkt->vec[0].iov_len != pkt->len);
+			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
 			sgs[out_sg++] = &buf;
 		}

@@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
 	struct virtio_vsock_pkt *pkt;
 	struct scatterlist hdr, buf, *sgs[2];
 	int ret;
+	void *pkt_buf;

 	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
 	if (!pkt)
 		return -ENOMEM;

-	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
-	if (!pkt->buf) {
+	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
+	if (!pkt_buf) {
 		virtio_transport_free_pkt(pkt);
 		return -ENOMEM;
 	}

+	pkt->vec[0].iov_base = pkt_buf;
+	pkt->vec[0].iov_len = buf_len;
+	pkt->nr_vecs = 1;
+
 	pkt->len = buf_len;

 	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
 	sgs[0] = &hdr;

-	sg_init_one(&buf, pkt->buf, buf_len);
+	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
 	sgs[1] = &buf;
 	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
 	if (ret)
@@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
 	return val < virtqueue_get_vring_size(vq);
 }

+static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
+		struct virtio_vsock *vsock, unsigned int *total_len)
+{
+	struct virtio_vsock_pkt *pkt;
+	u16 num_buf;
+	void *buf;
+	unsigned int len;
+	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
+
+	buf = virtqueue_get_buf(vq, &len);
+	if (!buf)
+		return NULL;
+
+	*total_len = len;
+	vsock->rx_buf_nr--;
+
+	if (unlikely(len < vsock_hlen)) {
+		put_page(virt_to_head_page(buf));
+		return NULL;
+	}
+
+	pkt = buf;
+	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
+	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
+		put_page(virt_to_head_page(buf));
+		return NULL;
+	}
+
+	/* Initialize pkt residual structure */
+	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
+			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
+
+	pkt->mergeable = true;
+	pkt->len = le32_to_cpu(pkt->hdr.len);
+	if (!pkt->len)
+		return pkt;
+
+	len -= vsock_hlen;
+	if (len) {
+		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
+		pkt->vec[pkt->nr_vecs].iov_len = len;
+		/* Shared page with pkt, so get page in advance */
+		get_page(virt_to_head_page(buf));
+		pkt->nr_vecs++;
+	}
+
+	while (--num_buf) {
+		buf = virtqueue_get_buf(vq, &len);
+		if (!buf)
+			goto err;
+
+		*total_len += len;
+		vsock->rx_buf_nr--;
+
+		pkt->vec[pkt->nr_vecs].iov_base = buf;
+		pkt->vec[pkt->nr_vecs].iov_len = len;
+		pkt->nr_vecs++;
+	}
+
+	return pkt;
+err:
+	virtio_transport_free_pkt(pkt);
+	return NULL;
+}
+
 static void virtio_transport_rx_work(struct work_struct *work)
 {
 	struct virtio_vsock *vsock =
 		container_of(work, struct virtio_vsock, rx_work);
 	struct virtqueue *vq;
+	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
+			sizeof(struct virtio_vsock_hdr);

 	vq = vsock->vqs[VSOCK_VQ_RX];

@@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
 				goto out;
 			}

-			pkt = virtqueue_get_buf(vq, &len);
-			if (!pkt) {
-				break;
-			}
+			if (likely(vsock->mergeable)) {
+				pkt = receive_mergeable(vq, vsock, &len);
+				if (!pkt)
+					break;
+			} else {
+				pkt = virtqueue_get_buf(vq, &len);
+				if (!pkt)
+					break;

-			vsock->rx_buf_nr--;
+				vsock->rx_buf_nr--;
+			}

 			/* Drop short/long packets */
-			if (unlikely(len < sizeof(pkt->hdr) ||
-				     len > sizeof(pkt->hdr) + pkt->len)) {
+			if (unlikely(len < vsock_hlen ||
+				     len > vsock_hlen + pkt->len)) {
 				virtio_transport_free_pkt(pkt);
 				continue;
 			}

-			pkt->len = len - sizeof(pkt->hdr);
+			pkt->len = len - vsock_hlen;
 			virtio_transport_deliver_tap_pkt(pkt);
 			virtio_transport_recv_pkt(pkt);
 		}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 3ae3a33..123a8b6 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
 {
 	struct virtio_vsock_pkt *pkt;
 	int err;
+	void *buf = NULL;

 	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
 	if (!pkt)
@@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
 	pkt->vsk		= info->vsk;

 	if (info->msg && len > 0) {
-		pkt->buf = kmalloc(len, GFP_KERNEL);
-		if (!pkt->buf)
+		buf = kmalloc(len, GFP_KERNEL);
+		if (!buf)
 			goto out_pkt;
-		err = memcpy_from_msg(pkt->buf, info->msg, len);
+		err = memcpy_from_msg(buf, info->msg, len);
 		if (err)
 			goto out;
+
+		pkt->vec[0].iov_base = buf;
+		pkt->vec[0].iov_len = len;
+		pkt->nr_vecs = 1;
 	}

 	trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
 	return pkt;

 out:
-	kfree(pkt->buf);
+	kfree(buf);
 out_pkt:
 	kfree(pkt);
 	return NULL;
@@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
 	struct virtio_vsock_pkt *pkt = opaque;
 	struct af_vsockmon_hdr *hdr;
 	struct sk_buff *skb;
+	int i;

 	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
 			GFP_ATOMIC);
@@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
 	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));

 	if (pkt->len) {
-		skb_put_data(skb, pkt->buf, pkt->len);
+		for (i = 0; i < pkt->nr_vecs; i++)
+			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
 	}

 	return skb;
@@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,

 	spin_lock_bh(&vvs->rx_lock);
 	while (total < len && !list_empty(&vvs->rx_queue)) {
+		size_t copy_bytes, last_vec_total = 0, vec_off;
+		int i;
+
 		pkt = list_first_entry(&vvs->rx_queue,
 				       struct virtio_vsock_pkt, list);

@@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
 		 */
 		spin_unlock_bh(&vvs->rx_lock);

-		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
-		if (err)
-			goto out;
+		for (i = 0; i < pkt->nr_vecs; i++) {
+			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
+				last_vec_total += pkt->vec[i].iov_len;
+				continue;
+			}
+
+			vec_off = pkt->off - last_vec_total;
+			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
+			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
+					copy_bytes);
+			if (err)
+				goto out;
+
+			bytes -= copy_bytes;
+			pkt->off += copy_bytes;
+			total += copy_bytes;
+			last_vec_total += pkt->vec[i].iov_len;
+			if (!bytes)
+				break;
+		}

 		spin_lock_bh(&vvs->rx_lock);
-
-		total += bytes;
-		pkt->off += bytes;
 		if (pkt->off == pkt->len) {
 			virtio_transport_dec_rx_pkt(vvs, pkt);
 			list_del(&pkt->list);
@@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)

 void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
 {
-	kfree(pkt->buf);
-	kfree(pkt);
+	int i;
+
+	if (pkt->mergeable) {
+		for (i = 0; i < pkt->nr_vecs; i++)
+			put_page(virt_to_head_page(pkt->vec[i].iov_base));
+		put_page(virt_to_head_page((void *)pkt));
+	} else {
+		for (i = 0; i < pkt->nr_vecs; i++)
+			kfree(pkt->vec[i].iov_base);
+		kfree(pkt);
+	}
 }
 EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);

-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-12  9:31 [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest jiangyiwen
@ 2018-12-12 15:31 ` Michael S. Tsirkin
  2018-12-13  2:38   ` jiangyiwen
  2018-12-12 15:31 ` Michael S. Tsirkin
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Michael S. Tsirkin @ 2018-12-12 15:31 UTC (permalink / raw)
  To: jiangyiwen; +Cc: Stefan Hajnoczi, Jason Wang, netdev, kvm, virtualization

On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> When the guest receives a mergeable rx buffer, it can merge
> scattered rx buffers into one big buffer and then copy it
> to user space.
> 
> In addition, it also uses an iovec to replace buf in struct
> virtio_vsock_pkt, keeping tx and rx consistent. The only
> difference is that tx is still implemented with a single
> segment of contiguous physical memory.
> 
> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
> ---
>  drivers/vhost/vsock.c                   |  31 +++++++---
>  include/linux/virtio_vsock.h            |   6 +-
>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
>  4 files changed, 166 insertions(+), 35 deletions(-)


This was supposed to be a guest patch, why is vhost changed here?

> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index dc52b0f..c7ab0dd 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  		size_t nbytes;
>  		size_t len;
>  		s16 headcount;
> +		size_t remain_len;
> +		int i;
> 
>  		spin_lock_bh(&vsock->send_pkt_list_lock);
>  		if (list_empty(&vsock->send_pkt_list)) {
> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  			break;
>  		}
> 
> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> -		if (nbytes != pkt->len) {
> -			virtio_transport_free_pkt(pkt);
> -			vq_err(vq, "Faulted on copying pkt buf\n");
> -			break;
> +		remain_len = pkt->len;
> +		for (i = 0; i < pkt->nr_vecs; i++) {
> +			int tmp_len;
> +
> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
> +			if (nbytes != tmp_len) {
> +				virtio_transport_free_pkt(pkt);
> +				vq_err(vq, "Faulted on copying pkt buf\n");
> +				break;
> +			}
> +
> +			remain_len -= tmp_len;
>  		}
> 
>  		vhost_add_used_n(vq, vq->heads, headcount);
> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>  	struct iov_iter iov_iter;
>  	size_t nbytes;
>  	size_t len;
> +	void *buf;
> 
>  	if (in != 0) {
>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>  		return NULL;
>  	}
> 
> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
> -	if (!pkt->buf) {
> +	buf = kmalloc(pkt->len, GFP_KERNEL);
> +	if (!buf) {
>  		kfree(pkt);
>  		return NULL;
>  	}
> 
> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> +	pkt->vec[0].iov_base = buf;
> +	pkt->vec[0].iov_len = pkt->len;
> +	pkt->nr_vecs = 1;
> +
> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
>  	if (nbytes != pkt->len) {
>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>  		       pkt->len, nbytes);
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index da9e1fe..734eeed 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -13,6 +13,8 @@
>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
> 
>  /* Virtio-vsock feature */
>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
>  	struct list_head list;
>  	/* socket refcnt not held, only use for cancellation */
>  	struct vsock_sock *vsk;
> -	void *buf;
> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
> +	int nr_vecs;
>  	u32 len;
>  	u32 off;
>  	bool reply;
> +	bool mergeable;
>  };
> 
>  struct virtio_vsock_pkt_info {
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index c4a465c..148b58a 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
> 
>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>  		sgs[out_sg++] = &hdr;
> -		if (pkt->buf) {
> -			sg_init_one(&buf, pkt->buf, pkt->len);
> +		if (pkt->len) {
> +			/* Currently only support a segment of memory in tx */
> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
>  			sgs[out_sg++] = &buf;
>  		}
> 
> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
>  	struct virtio_vsock_pkt *pkt;
>  	struct scatterlist hdr, buf, *sgs[2];
>  	int ret;
> +	void *pkt_buf;
> 
>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>  	if (!pkt)
>  		return -ENOMEM;
> 
> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
> -	if (!pkt->buf) {
> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
> +	if (!pkt_buf) {
>  		virtio_transport_free_pkt(pkt);
>  		return -ENOMEM;
>  	}
> 
> +	pkt->vec[0].iov_base = pkt_buf;
> +	pkt->vec[0].iov_len = buf_len;
> +	pkt->nr_vecs = 1;
> +
>  	pkt->len = buf_len;
> 
>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>  	sgs[0] = &hdr;
> 
> -	sg_init_one(&buf, pkt->buf, buf_len);
> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
>  	sgs[1] = &buf;
>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>  	if (ret)
> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
>  	return val < virtqueue_get_vring_size(vq);
>  }
> 
> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> +		struct virtio_vsock *vsock, unsigned int *total_len)
> +{
> +	struct virtio_vsock_pkt *pkt;
> +	u16 num_buf;
> +	void *buf;
> +	unsigned int len;
> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> +
> +	buf = virtqueue_get_buf(vq, &len);
> +	if (!buf)
> +		return NULL;
> +
> +	*total_len = len;
> +	vsock->rx_buf_nr--;
> +
> +	if (unlikely(len < vsock_hlen)) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}
> +
> +	pkt = buf;
> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}

So everything just stops going, and host and user don't even
know what the reason is. And not only that - the next
packet will be corrupted because we skipped the first one.



> +
> +	/* Initialize pkt residual structure */
> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> +
> +	pkt->mergeable = true;
> +	pkt->len = le32_to_cpu(pkt->hdr.len);
> +	if (!pkt->len)
> +		return pkt;
> +
> +	len -= vsock_hlen;
> +	if (len) {
> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> +		/* Shared page with pkt, so get page in advance */
> +		get_page(virt_to_head_page(buf));
> +		pkt->nr_vecs++;
> +	}
> +
> +	while (--num_buf) {
> +		buf = virtqueue_get_buf(vq, &len);
> +		if (!buf)
> +			goto err;
> +
> +		*total_len += len;
> +		vsock->rx_buf_nr--;
> +
> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> +		pkt->nr_vecs++;
> +	}
> +
> +	return pkt;
> +err:
> +	virtio_transport_free_pkt(pkt);
> +	return NULL;
> +}
> +
>  static void virtio_transport_rx_work(struct work_struct *work)
>  {
>  	struct virtio_vsock *vsock =
>  		container_of(work, struct virtio_vsock, rx_work);
>  	struct virtqueue *vq;
> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
> +			sizeof(struct virtio_vsock_hdr);
> 
>  	vq = vsock->vqs[VSOCK_VQ_RX];
> 
> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
>  				goto out;
>  			}
> 
> -			pkt = virtqueue_get_buf(vq, &len);
> -			if (!pkt) {
> -				break;
> -			}
> +			if (likely(vsock->mergeable)) {
> +				pkt = receive_mergeable(vq, vsock, &len);
> +				if (!pkt)
> +					break;
> +			} else {
> +				pkt = virtqueue_get_buf(vq, &len);
> +				if (!pkt)
> +					break;
> 

So looking at it, this seems to be the main source of the gain.
But why does this require host/guest changes?


The way I see it:
	- get a buffer and create an skb
	- get the next one, check header matches, if yes
	  tack it on the skb as a fragment. If not then
	  don't, deliver previous one and queue the new one.



> -			vsock->rx_buf_nr--;
> +				vsock->rx_buf_nr--;
> +			}
> 
>  			/* Drop short/long packets */
> -			if (unlikely(len < sizeof(pkt->hdr) ||
> -				     len > sizeof(pkt->hdr) + pkt->len)) {
> +			if (unlikely(len < vsock_hlen ||
> +				     len > vsock_hlen + pkt->len)) {
>  				virtio_transport_free_pkt(pkt);
>  				continue;
>  			}
> 
> -			pkt->len = len - sizeof(pkt->hdr);
> +			pkt->len = len - vsock_hlen;
>  			virtio_transport_deliver_tap_pkt(pkt);
>  			virtio_transport_recv_pkt(pkt);
>  		}
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 3ae3a33..123a8b6 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>  {
>  	struct virtio_vsock_pkt *pkt;
>  	int err;
> +	void *buf = NULL;
> 
>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>  	if (!pkt)
> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>  	pkt->vsk		= info->vsk;
> 
>  	if (info->msg && len > 0) {
> -		pkt->buf = kmalloc(len, GFP_KERNEL);
> -		if (!pkt->buf)
> +		buf = kmalloc(len, GFP_KERNEL);
> +		if (!buf)
>  			goto out_pkt;
> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
> +		err = memcpy_from_msg(buf, info->msg, len);
>  		if (err)
>  			goto out;
> +
> +		pkt->vec[0].iov_base = buf;
> +		pkt->vec[0].iov_len = len;
> +		pkt->nr_vecs = 1;
>  	}
> 
>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>  	return pkt;
> 
>  out:
> -	kfree(pkt->buf);
> +	kfree(buf);
>  out_pkt:
>  	kfree(pkt);
>  	return NULL;
> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  	struct virtio_vsock_pkt *pkt = opaque;
>  	struct af_vsockmon_hdr *hdr;
>  	struct sk_buff *skb;
> +	int i;
> 
>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
>  			GFP_ATOMIC);
> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
> 
>  	if (pkt->len) {
> -		skb_put_data(skb, pkt->buf, pkt->len);
> +		for (i = 0; i < pkt->nr_vecs; i++)
> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
>  	}
> 
>  	return skb;
> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> 
>  	spin_lock_bh(&vvs->rx_lock);
>  	while (total < len && !list_empty(&vvs->rx_queue)) {
> +		size_t copy_bytes, last_vec_total = 0, vec_off;
> +		int i;
> +
>  		pkt = list_first_entry(&vvs->rx_queue,
>  				       struct virtio_vsock_pkt, list);
> 
> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>  		 */
>  		spin_unlock_bh(&vvs->rx_lock);
> 
> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> -		if (err)
> -			goto out;
> +		for (i = 0; i < pkt->nr_vecs; i++) {
> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
> +				last_vec_total += pkt->vec[i].iov_len;
> +				continue;
> +			}
> +
> +			vec_off = pkt->off - last_vec_total;
> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
> +					copy_bytes);
> +			if (err)
> +				goto out;
> +
> +			bytes -= copy_bytes;
> +			pkt->off += copy_bytes;
> +			total += copy_bytes;
> +			last_vec_total += pkt->vec[i].iov_len;
> +			if (!bytes)
> +				break;
> +		}
> 
>  		spin_lock_bh(&vvs->rx_lock);
> -
> -		total += bytes;
> -		pkt->off += bytes;
>  		if (pkt->off == pkt->len) {
>  			virtio_transport_dec_rx_pkt(vvs, pkt);
>  			list_del(&pkt->list);
> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
> 
>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
>  {
> -	kfree(pkt->buf);
> -	kfree(pkt);
> +	int i;
> +
> +	if (pkt->mergeable) {
> +		for (i = 0; i < pkt->nr_vecs; i++)
> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
> +		put_page(virt_to_head_page((void *)pkt));
> +	} else {
> +		for (i = 0; i < pkt->nr_vecs; i++)
> +			kfree(pkt->vec[i].iov_base);
> +		kfree(pkt);
> +	}
>  }
>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
> 
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-12  9:31 [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest jiangyiwen
  2018-12-12 15:31 ` Michael S. Tsirkin
@ 2018-12-12 15:31 ` Michael S. Tsirkin
  2018-12-13 16:20 ` Stefan Hajnoczi
  2018-12-13 16:20 ` Stefan Hajnoczi
  3 siblings, 0 replies; 13+ messages in thread
From: Michael S. Tsirkin @ 2018-12-12 15:31 UTC (permalink / raw)
  To: jiangyiwen; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization

On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> When the guest receives a mergeable rx buffer, it can merge
> scattered rx buffers into one big buffer and then copy it
> to user space.
> 
> In addition, it also uses an iovec to replace buf in struct
> virtio_vsock_pkt, keeping tx and rx consistent. The only
> difference is that tx is still implemented with a single
> segment of contiguous physical memory.
> 
> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
> ---
>  drivers/vhost/vsock.c                   |  31 +++++++---
>  include/linux/virtio_vsock.h            |   6 +-
>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
>  4 files changed, 166 insertions(+), 35 deletions(-)


This was supposed to be a guest patch, why is vhost changed here?

> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index dc52b0f..c7ab0dd 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  		size_t nbytes;
>  		size_t len;
>  		s16 headcount;
> +		size_t remain_len;
> +		int i;
> 
>  		spin_lock_bh(&vsock->send_pkt_list_lock);
>  		if (list_empty(&vsock->send_pkt_list)) {
> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  			break;
>  		}
> 
> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> -		if (nbytes != pkt->len) {
> -			virtio_transport_free_pkt(pkt);
> -			vq_err(vq, "Faulted on copying pkt buf\n");
> -			break;
> +		remain_len = pkt->len;
> +		for (i = 0; i < pkt->nr_vecs; i++) {
> +			int tmp_len;
> +
> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
> +			if (nbytes != tmp_len) {
> +				virtio_transport_free_pkt(pkt);
> +				vq_err(vq, "Faulted on copying pkt buf\n");
> +				break;
> +			}
> +
> +			remain_len -= tmp_len;
>  		}
> 
>  		vhost_add_used_n(vq, vq->heads, headcount);
> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>  	struct iov_iter iov_iter;
>  	size_t nbytes;
>  	size_t len;
> +	void *buf;
> 
>  	if (in != 0) {
>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>  		return NULL;
>  	}
> 
> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
> -	if (!pkt->buf) {
> +	buf = kmalloc(pkt->len, GFP_KERNEL);
> +	if (!buf) {
>  		kfree(pkt);
>  		return NULL;
>  	}
> 
> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> +	pkt->vec[0].iov_base = buf;
> +	pkt->vec[0].iov_len = pkt->len;
> +	pkt->nr_vecs = 1;
> +
> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
>  	if (nbytes != pkt->len) {
>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>  		       pkt->len, nbytes);
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index da9e1fe..734eeed 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -13,6 +13,8 @@
>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
> 
>  /* Virtio-vsock feature */
>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
>  	struct list_head list;
>  	/* socket refcnt not held, only use for cancellation */
>  	struct vsock_sock *vsk;
> -	void *buf;
> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
> +	int nr_vecs;
>  	u32 len;
>  	u32 off;
>  	bool reply;
> +	bool mergeable;
>  };
> 
>  struct virtio_vsock_pkt_info {
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index c4a465c..148b58a 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
> 
>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>  		sgs[out_sg++] = &hdr;
> -		if (pkt->buf) {
> -			sg_init_one(&buf, pkt->buf, pkt->len);
> +		if (pkt->len) {
> +			/* Currently only support a segment of memory in tx */
> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
>  			sgs[out_sg++] = &buf;
>  		}
> 
> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
>  	struct virtio_vsock_pkt *pkt;
>  	struct scatterlist hdr, buf, *sgs[2];
>  	int ret;
> +	void *pkt_buf;
> 
>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>  	if (!pkt)
>  		return -ENOMEM;
> 
> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
> -	if (!pkt->buf) {
> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
> +	if (!pkt_buf) {
>  		virtio_transport_free_pkt(pkt);
>  		return -ENOMEM;
>  	}
> 
> +	pkt->vec[0].iov_base = pkt_buf;
> +	pkt->vec[0].iov_len = buf_len;
> +	pkt->nr_vecs = 1;
> +
>  	pkt->len = buf_len;
> 
>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>  	sgs[0] = &hdr;
> 
> -	sg_init_one(&buf, pkt->buf, buf_len);
> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
>  	sgs[1] = &buf;
>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>  	if (ret)
> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
>  	return val < virtqueue_get_vring_size(vq);
>  }
> 
> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> +		struct virtio_vsock *vsock, unsigned int *total_len)
> +{
> +	struct virtio_vsock_pkt *pkt;
> +	u16 num_buf;
> +	void *buf;
> +	unsigned int len;
> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> +
> +	buf = virtqueue_get_buf(vq, &len);
> +	if (!buf)
> +		return NULL;
> +
> +	*total_len = len;
> +	vsock->rx_buf_nr--;
> +
> +	if (unlikely(len < vsock_hlen)) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}
> +
> +	pkt = buf;
> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}

So everything just stops going, and host and user don't even
know what the reason is. And not only that - the next
packet will be corrupted because we skipped the first one.



> +
> +	/* Initialize pkt residual structure */
> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> +
> +	pkt->mergeable = true;
> +	pkt->len = le32_to_cpu(pkt->hdr.len);
> +	if (!pkt->len)
> +		return pkt;
> +
> +	len -= vsock_hlen;
> +	if (len) {
> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> +		/* Shared page with pkt, so get page in advance */
> +		get_page(virt_to_head_page(buf));
> +		pkt->nr_vecs++;
> +	}
> +
> +	while (--num_buf) {
> +		buf = virtqueue_get_buf(vq, &len);
> +		if (!buf)
> +			goto err;
> +
> +		*total_len += len;
> +		vsock->rx_buf_nr--;
> +
> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> +		pkt->nr_vecs++;
> +	}
> +
> +	return pkt;
> +err:
> +	virtio_transport_free_pkt(pkt);
> +	return NULL;
> +}
> +
>  static void virtio_transport_rx_work(struct work_struct *work)
>  {
>  	struct virtio_vsock *vsock =
>  		container_of(work, struct virtio_vsock, rx_work);
>  	struct virtqueue *vq;
> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
> +			sizeof(struct virtio_vsock_hdr);
> 
>  	vq = vsock->vqs[VSOCK_VQ_RX];
> 
> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
>  				goto out;
>  			}
> 
> -			pkt = virtqueue_get_buf(vq, &len);
> -			if (!pkt) {
> -				break;
> -			}
> +			if (likely(vsock->mergeable)) {
> +				pkt = receive_mergeable(vq, vsock, &len);
> +				if (!pkt)
> +					break;
> +			} else {
> +				pkt = virtqueue_get_buf(vq, &len);
> +				if (!pkt)
> +					break;
> 

So looking at it, this seems to be the main source of the gain.
But why does this require host/guest changes?


The way I see it:
	- get a buffer and create an skb
	- get the next one, check header matches, if yes
	  tack it on the skb as a fragment. If not then
	  don't, deliver previous one and queue the new one.



> -			vsock->rx_buf_nr--;
> +				vsock->rx_buf_nr--;
> +			}
> 
>  			/* Drop short/long packets */
> -			if (unlikely(len < sizeof(pkt->hdr) ||
> -				     len > sizeof(pkt->hdr) + pkt->len)) {
> +			if (unlikely(len < vsock_hlen ||
> +				     len > vsock_hlen + pkt->len)) {
>  				virtio_transport_free_pkt(pkt);
>  				continue;
>  			}
> 
> -			pkt->len = len - sizeof(pkt->hdr);
> +			pkt->len = len - vsock_hlen;
>  			virtio_transport_deliver_tap_pkt(pkt);
>  			virtio_transport_recv_pkt(pkt);
>  		}
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 3ae3a33..123a8b6 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>  {
>  	struct virtio_vsock_pkt *pkt;
>  	int err;
> +	void *buf = NULL;
> 
>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>  	if (!pkt)
> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>  	pkt->vsk		= info->vsk;
> 
>  	if (info->msg && len > 0) {
> -		pkt->buf = kmalloc(len, GFP_KERNEL);
> -		if (!pkt->buf)
> +		buf = kmalloc(len, GFP_KERNEL);
> +		if (!buf)
>  			goto out_pkt;
> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
> +		err = memcpy_from_msg(buf, info->msg, len);
>  		if (err)
>  			goto out;
> +
> +		pkt->vec[0].iov_base = buf;
> +		pkt->vec[0].iov_len = len;
> +		pkt->nr_vecs = 1;
>  	}
> 
>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>  	return pkt;
> 
>  out:
> -	kfree(pkt->buf);
> +	kfree(buf);
>  out_pkt:
>  	kfree(pkt);
>  	return NULL;
> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  	struct virtio_vsock_pkt *pkt = opaque;
>  	struct af_vsockmon_hdr *hdr;
>  	struct sk_buff *skb;
> +	int i;
> 
>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
>  			GFP_ATOMIC);
> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
> 
>  	if (pkt->len) {
> -		skb_put_data(skb, pkt->buf, pkt->len);
> +		for (i = 0; i < pkt->nr_vecs; i++)
> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
>  	}
> 
>  	return skb;
> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> 
>  	spin_lock_bh(&vvs->rx_lock);
>  	while (total < len && !list_empty(&vvs->rx_queue)) {
> +		size_t copy_bytes, last_vec_total = 0, vec_off;
> +		int i;
> +
>  		pkt = list_first_entry(&vvs->rx_queue,
>  				       struct virtio_vsock_pkt, list);
> 
> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>  		 */
>  		spin_unlock_bh(&vvs->rx_lock);
> 
> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> -		if (err)
> -			goto out;
> +		for (i = 0; i < pkt->nr_vecs; i++) {
> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
> +				last_vec_total += pkt->vec[i].iov_len;
> +				continue;
> +			}
> +
> +			vec_off = pkt->off - last_vec_total;
> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
> +					copy_bytes);
> +			if (err)
> +				goto out;
> +
> +			bytes -= copy_bytes;
> +			pkt->off += copy_bytes;
> +			total += copy_bytes;
> +			last_vec_total += pkt->vec[i].iov_len;
> +			if (!bytes)
> +				break;
> +		}
> 
>  		spin_lock_bh(&vvs->rx_lock);
> -
> -		total += bytes;
> -		pkt->off += bytes;
>  		if (pkt->off == pkt->len) {
>  			virtio_transport_dec_rx_pkt(vvs, pkt);
>  			list_del(&pkt->list);
> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
> 
>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
>  {
> -	kfree(pkt->buf);
> -	kfree(pkt);
> +	int i;
> +
> +	if (pkt->mergeable) {
> +		for (i = 0; i < pkt->nr_vecs; i++)
> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
> +		put_page(virt_to_head_page((void *)pkt));
> +	} else {
> +		for (i = 0; i < pkt->nr_vecs; i++)
> +			kfree(pkt->vec[i].iov_base);
> +		kfree(pkt);
> +	}
>  }
>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
> 
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-12 15:31 ` Michael S. Tsirkin
@ 2018-12-13  2:38   ` jiangyiwen
  2018-12-13 14:29     ` Michael S. Tsirkin
  2018-12-13 14:29     ` Michael S. Tsirkin
  0 siblings, 2 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-13  2:38 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization

Hi Michael,

On 2018/12/12 23:31, Michael S. Tsirkin wrote:
> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>> Guest receive mergeable rx buffer, it can merge
>> scatter rx buffer into a big buffer and then copy
>> to user space.
>>
>> In addition, it also use iovec to replace buf in struct
>> virtio_vsock_pkt, keep tx and rx consistency. The only
>> difference is now tx still uses a segment of continuous
>> physical memory to implement.
>>
>> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
>> ---
>>  drivers/vhost/vsock.c                   |  31 +++++++---
>>  include/linux/virtio_vsock.h            |   6 +-
>>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
>>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
>>  4 files changed, 166 insertions(+), 35 deletions(-)
> 
> 
> This was supposed to be a guest patch, why is vhost changed here?
> 

In mergeable rx buffer cases, the guest needs to scatter big packets into
several buffers, so I added a kvec array to struct virtio_vsock_pkt. At the
same time, in order to keep tx and rx consistent, I used the kvec to replace
the buf variable. Because vhost uses pkt->buf, this patch also changes vhost.

>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index dc52b0f..c7ab0dd 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>  		size_t nbytes;
>>  		size_t len;
>>  		s16 headcount;
>> +		size_t remain_len;
>> +		int i;
>>
>>  		spin_lock_bh(&vsock->send_pkt_list_lock);
>>  		if (list_empty(&vsock->send_pkt_list)) {
>> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>  			break;
>>  		}
>>
>> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
>> -		if (nbytes != pkt->len) {
>> -			virtio_transport_free_pkt(pkt);
>> -			vq_err(vq, "Faulted on copying pkt buf\n");
>> -			break;
>> +		remain_len = pkt->len;
>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>> +			int tmp_len;
>> +
>> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
>> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
>> +			if (nbytes != tmp_len) {
>> +				virtio_transport_free_pkt(pkt);
>> +				vq_err(vq, "Faulted on copying pkt buf\n");
>> +				break;
>> +			}
>> +
>> +			remain_len -= tmp_len;
>>  		}
>>
>>  		vhost_add_used_n(vq, vq->heads, headcount);
>> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>  	struct iov_iter iov_iter;
>>  	size_t nbytes;
>>  	size_t len;
>> +	void *buf;
>>
>>  	if (in != 0) {
>>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
>> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>  		return NULL;
>>  	}
>>
>> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
>> -	if (!pkt->buf) {
>> +	buf = kmalloc(pkt->len, GFP_KERNEL);
>> +	if (!buf) {
>>  		kfree(pkt);
>>  		return NULL;
>>  	}
>>
>> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>> +	pkt->vec[0].iov_base = buf;
>> +	pkt->vec[0].iov_len = pkt->len;
>> +	pkt->nr_vecs = 1;
>> +
>> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
>>  	if (nbytes != pkt->len) {
>>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>  		       pkt->len, nbytes);
>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> index da9e1fe..734eeed 100644
>> --- a/include/linux/virtio_vsock.h
>> +++ b/include/linux/virtio_vsock.h
>> @@ -13,6 +13,8 @@
>>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
>>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
>>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
>> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>>
>>  /* Virtio-vsock feature */
>>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
>>  	struct list_head list;
>>  	/* socket refcnt not held, only use for cancellation */
>>  	struct vsock_sock *vsk;
>> -	void *buf;
>> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
>> +	int nr_vecs;
>>  	u32 len;
>>  	u32 off;
>>  	bool reply;
>> +	bool mergeable;
>>  };
>>
>>  struct virtio_vsock_pkt_info {
>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>> index c4a465c..148b58a 100644
>> --- a/net/vmw_vsock/virtio_transport.c
>> +++ b/net/vmw_vsock/virtio_transport.c
>> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
>>
>>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>  		sgs[out_sg++] = &hdr;
>> -		if (pkt->buf) {
>> -			sg_init_one(&buf, pkt->buf, pkt->len);
>> +		if (pkt->len) {
>> +			/* Currently only support a segment of memory in tx */
>> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
>> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
>>  			sgs[out_sg++] = &buf;
>>  		}
>>
>> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
>>  	struct virtio_vsock_pkt *pkt;
>>  	struct scatterlist hdr, buf, *sgs[2];
>>  	int ret;
>> +	void *pkt_buf;
>>
>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>  	if (!pkt)
>>  		return -ENOMEM;
>>
>> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
>> -	if (!pkt->buf) {
>> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
>> +	if (!pkt_buf) {
>>  		virtio_transport_free_pkt(pkt);
>>  		return -ENOMEM;
>>  	}
>>
>> +	pkt->vec[0].iov_base = pkt_buf;
>> +	pkt->vec[0].iov_len = buf_len;
>> +	pkt->nr_vecs = 1;
>> +
>>  	pkt->len = buf_len;
>>
>>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>  	sgs[0] = &hdr;
>>
>> -	sg_init_one(&buf, pkt->buf, buf_len);
>> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
>>  	sgs[1] = &buf;
>>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>>  	if (ret)
>> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
>>  	return val < virtqueue_get_vring_size(vq);
>>  }
>>
>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>> +{
>> +	struct virtio_vsock_pkt *pkt;
>> +	u16 num_buf;
>> +	void *buf;
>> +	unsigned int len;
>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>> +
>> +	buf = virtqueue_get_buf(vq, &len);
>> +	if (!buf)
>> +		return NULL;
>> +
>> +	*total_len = len;
>> +	vsock->rx_buf_nr--;
>> +
>> +	if (unlikely(len < vsock_hlen)) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	pkt = buf;
>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
> 
> So everything just stops going, and host and user don't even
> know what the reason is. And not only that - the next
> packet will be corrupted because we skipped the first one.
> 
> 

I understand this case will not be encountered unless the code has a
*BUG*, e.g. the host sends some problematic packets (shorter/longer than
expected). In this case, I think we should ignore/drop these packets.

> 
>> +
>> +	/* Initialize pkt residual structure */
>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
>> +
>> +	pkt->mergeable = true;
>> +	pkt->len = le32_to_cpu(pkt->hdr.len);
>> +	if (!pkt->len)
>> +		return pkt;
>> +
>> +	len -= vsock_hlen;
>> +	if (len) {
>> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>> +		/* Shared page with pkt, so get page in advance */
>> +		get_page(virt_to_head_page(buf));
>> +		pkt->nr_vecs++;
>> +	}
>> +
>> +	while (--num_buf) {
>> +		buf = virtqueue_get_buf(vq, &len);
>> +		if (!buf)
>> +			goto err;
>> +
>> +		*total_len += len;
>> +		vsock->rx_buf_nr--;
>> +
>> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>> +		pkt->nr_vecs++;
>> +	}
>> +
>> +	return pkt;
>> +err:
>> +	virtio_transport_free_pkt(pkt);
>> +	return NULL;
>> +}
>> +
>>  static void virtio_transport_rx_work(struct work_struct *work)
>>  {
>>  	struct virtio_vsock *vsock =
>>  		container_of(work, struct virtio_vsock, rx_work);
>>  	struct virtqueue *vq;
>> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
>> +			sizeof(struct virtio_vsock_hdr);
>>
>>  	vq = vsock->vqs[VSOCK_VQ_RX];
>>
>> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
>>  				goto out;
>>  			}
>>
>> -			pkt = virtqueue_get_buf(vq, &len);
>> -			if (!pkt) {
>> -				break;
>> -			}
>> +			if (likely(vsock->mergeable)) {
>> +				pkt = receive_mergeable(vq, vsock, &len);
>> +				if (!pkt)
>> +					break;
>> +			} else {
>> +				pkt = virtqueue_get_buf(vq, &len);
>> +				if (!pkt)
>> +					break;
>>
> 
> So looking at it, this seems to be the main source of the gain.
> But why does this require host/guest changes?
> 
> 
> The way I see it:
> 	- get a buffer and create an skb
> 	- get the next one, check header matches, if yes
> 	  tack it on the skb as a fragment. If not then
> 	  don't, deliver previous one and queue the new one.
> 
> 

The reason for the vhost change is explained above, and I would like to
use a kvec instead of buf; after all, buf can only describe a contiguous
region of physical memory.

Thanks,
Yiwen.

> 
>> -			vsock->rx_buf_nr--;
>> +				vsock->rx_buf_nr--;
>> +			}
>>
>>  			/* Drop short/long packets */
>> -			if (unlikely(len < sizeof(pkt->hdr) ||
>> -				     len > sizeof(pkt->hdr) + pkt->len)) {
>> +			if (unlikely(len < vsock_hlen ||
>> +				     len > vsock_hlen + pkt->len)) {
>>  				virtio_transport_free_pkt(pkt);
>>  				continue;
>>  			}
>>
>> -			pkt->len = len - sizeof(pkt->hdr);
>> +			pkt->len = len - vsock_hlen;
>>  			virtio_transport_deliver_tap_pkt(pkt);
>>  			virtio_transport_recv_pkt(pkt);
>>  		}
>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>> index 3ae3a33..123a8b6 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>  {
>>  	struct virtio_vsock_pkt *pkt;
>>  	int err;
>> +	void *buf = NULL;
>>
>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>  	if (!pkt)
>> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>  	pkt->vsk		= info->vsk;
>>
>>  	if (info->msg && len > 0) {
>> -		pkt->buf = kmalloc(len, GFP_KERNEL);
>> -		if (!pkt->buf)
>> +		buf = kmalloc(len, GFP_KERNEL);
>> +		if (!buf)
>>  			goto out_pkt;
>> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
>> +		err = memcpy_from_msg(buf, info->msg, len);
>>  		if (err)
>>  			goto out;
>> +
>> +		pkt->vec[0].iov_base = buf;
>> +		pkt->vec[0].iov_len = len;
>> +		pkt->nr_vecs = 1;
>>  	}
>>
>>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>  	return pkt;
>>
>>  out:
>> -	kfree(pkt->buf);
>> +	kfree(buf);
>>  out_pkt:
>>  	kfree(pkt);
>>  	return NULL;
>> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>  	struct virtio_vsock_pkt *pkt = opaque;
>>  	struct af_vsockmon_hdr *hdr;
>>  	struct sk_buff *skb;
>> +	int i;
>>
>>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
>>  			GFP_ATOMIC);
>> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
>>
>>  	if (pkt->len) {
>> -		skb_put_data(skb, pkt->buf, pkt->len);
>> +		for (i = 0; i < pkt->nr_vecs; i++)
>> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
>>  	}
>>
>>  	return skb;
>> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>
>>  	spin_lock_bh(&vvs->rx_lock);
>>  	while (total < len && !list_empty(&vvs->rx_queue)) {
>> +		size_t copy_bytes, last_vec_total = 0, vec_off;
>> +		int i;
>> +
>>  		pkt = list_first_entry(&vvs->rx_queue,
>>  				       struct virtio_vsock_pkt, list);
>>
>> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>  		 */
>>  		spin_unlock_bh(&vvs->rx_lock);
>>
>> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
>> -		if (err)
>> -			goto out;
>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
>> +				last_vec_total += pkt->vec[i].iov_len;
>> +				continue;
>> +			}
>> +
>> +			vec_off = pkt->off - last_vec_total;
>> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
>> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
>> +					copy_bytes);
>> +			if (err)
>> +				goto out;
>> +
>> +			bytes -= copy_bytes;
>> +			pkt->off += copy_bytes;
>> +			total += copy_bytes;
>> +			last_vec_total += pkt->vec[i].iov_len;
>> +			if (!bytes)
>> +				break;
>> +		}
>>
>>  		spin_lock_bh(&vvs->rx_lock);
>> -
>> -		total += bytes;
>> -		pkt->off += bytes;
>>  		if (pkt->off == pkt->len) {
>>  			virtio_transport_dec_rx_pkt(vvs, pkt);
>>  			list_del(&pkt->list);
>> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
>>
>>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
>>  {
>> -	kfree(pkt->buf);
>> -	kfree(pkt);
>> +	int i;
>> +
>> +	if (pkt->mergeable) {
>> +		for (i = 0; i < pkt->nr_vecs; i++)
>> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
>> +		put_page(virt_to_head_page((void *)pkt));
>> +	} else {
>> +		for (i = 0; i < pkt->nr_vecs; i++)
>> +			kfree(pkt->vec[i].iov_base);
>> +		kfree(pkt);
>> +	}
>>  }
>>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
>>
>> -- 
>> 1.8.3.1
>>
> 
> .
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-13  2:38   ` jiangyiwen
  2018-12-13 14:29     ` Michael S. Tsirkin
@ 2018-12-13 14:29     ` Michael S. Tsirkin
  2018-12-14  8:11       ` jiangyiwen
  2018-12-14  8:11       ` jiangyiwen
  1 sibling, 2 replies; 13+ messages in thread
From: Michael S. Tsirkin @ 2018-12-13 14:29 UTC (permalink / raw)
  To: jiangyiwen; +Cc: Stefan Hajnoczi, Jason Wang, netdev, kvm, virtualization

On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:
> Hi Michael,
> 
> On 2018/12/12 23:31, Michael S. Tsirkin wrote:
> > On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> >> Guest receive mergeable rx buffer, it can merge
> >> scatter rx buffer into a big buffer and then copy
> >> to user space.
> >>
> >> In addition, it also use iovec to replace buf in struct
> >> virtio_vsock_pkt, keep tx and rx consistency. The only
> >> difference is now tx still uses a segment of continuous
> >> physical memory to implement.
> >>
> >> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
> >> ---
> >>  drivers/vhost/vsock.c                   |  31 +++++++---
> >>  include/linux/virtio_vsock.h            |   6 +-
> >>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
> >>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
> >>  4 files changed, 166 insertions(+), 35 deletions(-)
> > 
> > 
> > This was supposed to be a guest patch, why is vhost changed here?
> > 
> 
> In mergeable rx buff cases, it need to scatter big packets into several
> buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same
> time, in order to keep tx and rx consistency, I use kvec to replace
> variable buf, because vhost use the variable pkt->buf, so this patch
> caused vhost is changed.

You'd want to split these patches imho.

> >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> >> index dc52b0f..c7ab0dd 100644
> >> --- a/drivers/vhost/vsock.c
> >> +++ b/drivers/vhost/vsock.c
> >> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> >>  		size_t nbytes;
> >>  		size_t len;
> >>  		s16 headcount;
> >> +		size_t remain_len;
> >> +		int i;
> >>
> >>  		spin_lock_bh(&vsock->send_pkt_list_lock);
> >>  		if (list_empty(&vsock->send_pkt_list)) {
> >> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> >>  			break;
> >>  		}
> >>
> >> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> >> -		if (nbytes != pkt->len) {
> >> -			virtio_transport_free_pkt(pkt);
> >> -			vq_err(vq, "Faulted on copying pkt buf\n");
> >> -			break;
> >> +		remain_len = pkt->len;
> >> +		for (i = 0; i < pkt->nr_vecs; i++) {
> >> +			int tmp_len;
> >> +
> >> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
> >> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
> >> +			if (nbytes != tmp_len) {
> >> +				virtio_transport_free_pkt(pkt);
> >> +				vq_err(vq, "Faulted on copying pkt buf\n");
> >> +				break;
> >> +			}
> >> +
> >> +			remain_len -= tmp_len;
> >>  		}
> >>
> >>  		vhost_add_used_n(vq, vq->heads, headcount);
> >> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> >>  	struct iov_iter iov_iter;
> >>  	size_t nbytes;
> >>  	size_t len;
> >> +	void *buf;
> >>
> >>  	if (in != 0) {
> >>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
> >> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> >>  		return NULL;
> >>  	}
> >>
> >> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
> >> -	if (!pkt->buf) {
> >> +	buf = kmalloc(pkt->len, GFP_KERNEL);
> >> +	if (!buf) {
> >>  		kfree(pkt);
> >>  		return NULL;
> >>  	}
> >>
> >> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> >> +	pkt->vec[0].iov_base = buf;
> >> +	pkt->vec[0].iov_len = pkt->len;
> >> +	pkt->nr_vecs = 1;
> >> +
> >> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
> >>  	if (nbytes != pkt->len) {
> >>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> >>  		       pkt->len, nbytes);
> >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> >> index da9e1fe..734eeed 100644
> >> --- a/include/linux/virtio_vsock.h
> >> +++ b/include/linux/virtio_vsock.h
> >> @@ -13,6 +13,8 @@
> >>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
> >>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
> >>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
> >> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
> >> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
> >>
> >>  /* Virtio-vsock feature */
> >>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
> >> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
> >>  	struct list_head list;
> >>  	/* socket refcnt not held, only use for cancellation */
> >>  	struct vsock_sock *vsk;
> >> -	void *buf;
> >> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
> >> +	int nr_vecs;
> >>  	u32 len;
> >>  	u32 off;
> >>  	bool reply;
> >> +	bool mergeable;
> >>  };
> >>
> >>  struct virtio_vsock_pkt_info {
> >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> >> index c4a465c..148b58a 100644
> >> --- a/net/vmw_vsock/virtio_transport.c
> >> +++ b/net/vmw_vsock/virtio_transport.c
> >> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
> >>
> >>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> >>  		sgs[out_sg++] = &hdr;
> >> -		if (pkt->buf) {
> >> -			sg_init_one(&buf, pkt->buf, pkt->len);
> >> +		if (pkt->len) {
> >> +			/* Currently only support a segment of memory in tx */
> >> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
> >> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
> >>  			sgs[out_sg++] = &buf;
> >>  		}
> >>
> >> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
> >>  	struct virtio_vsock_pkt *pkt;
> >>  	struct scatterlist hdr, buf, *sgs[2];
> >>  	int ret;
> >> +	void *pkt_buf;
> >>
> >>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
> >>  	if (!pkt)
> >>  		return -ENOMEM;
> >>
> >> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
> >> -	if (!pkt->buf) {
> >> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
> >> +	if (!pkt_buf) {
> >>  		virtio_transport_free_pkt(pkt);
> >>  		return -ENOMEM;
> >>  	}
> >>
> >> +	pkt->vec[0].iov_base = pkt_buf;
> >> +	pkt->vec[0].iov_len = buf_len;
> >> +	pkt->nr_vecs = 1;
> >> +
> >>  	pkt->len = buf_len;
> >>
> >>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> >>  	sgs[0] = &hdr;
> >>
> >> -	sg_init_one(&buf, pkt->buf, buf_len);
> >> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
> >>  	sgs[1] = &buf;
> >>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
> >>  	if (ret)
> >> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
> >>  	return val < virtqueue_get_vring_size(vq);
> >>  }
> >>
> >> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> >> +		struct virtio_vsock *vsock, unsigned int *total_len)
> >> +{
> >> +	struct virtio_vsock_pkt *pkt;
> >> +	u16 num_buf;
> >> +	void *buf;
> >> +	unsigned int len;
> >> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> >> +
> >> +	buf = virtqueue_get_buf(vq, &len);
> >> +	if (!buf)
> >> +		return NULL;
> >> +
> >> +	*total_len = len;
> >> +	vsock->rx_buf_nr--;
> >> +
> >> +	if (unlikely(len < vsock_hlen)) {
> >> +		put_page(virt_to_head_page(buf));
> >> +		return NULL;
> >> +	}
> >> +
> >> +	pkt = buf;
> >> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> >> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> >> +		put_page(virt_to_head_page(buf));
> >> +		return NULL;
> >> +	}
> > 
> > So everything just stops going, and host and user don't even
> > know what the reason is. And not only that - the next
> > packet will be corrupted because we skipped the first one.
> > 
> > 
> 
> I understand this case will not encountered unless the code has
> *BUG*, like Host send some problematic packages (shorten/longer than
> expected). In this case, I think we should ignore/drop these packets.

If there's a specific packet length expected, e.g.  in this case vector
size, it needs to be negotiated between host and guest in some way.

> > 
> >> +
> >> +	/* Initialize pkt residual structure */
> >> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> >> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> >> +
> >> +	pkt->mergeable = true;
> >> +	pkt->len = le32_to_cpu(pkt->hdr.len);
> >> +	if (!pkt->len)
> >> +		return pkt;
> >> +
> >> +	len -= vsock_hlen;
> >> +	if (len) {
> >> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
> >> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> >> +		/* Shared page with pkt, so get page in advance */
> >> +		get_page(virt_to_head_page(buf));
> >> +		pkt->nr_vecs++;
> >> +	}
> >> +
> >> +	while (--num_buf) {
> >> +		buf = virtqueue_get_buf(vq, &len);
> >> +		if (!buf)
> >> +			goto err;
> >> +
> >> +		*total_len += len;
> >> +		vsock->rx_buf_nr--;
> >> +
> >> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
> >> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> >> +		pkt->nr_vecs++;
> >> +	}
> >> +
> >> +	return pkt;
> >> +err:
> >> +	virtio_transport_free_pkt(pkt);
> >> +	return NULL;
> >> +}
> >> +
> >>  static void virtio_transport_rx_work(struct work_struct *work)
> >>  {
> >>  	struct virtio_vsock *vsock =
> >>  		container_of(work, struct virtio_vsock, rx_work);
> >>  	struct virtqueue *vq;
> >> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
> >> +			sizeof(struct virtio_vsock_hdr);
> >>
> >>  	vq = vsock->vqs[VSOCK_VQ_RX];
> >>
> >> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
> >>  				goto out;
> >>  			}
> >>
> >> -			pkt = virtqueue_get_buf(vq, &len);
> >> -			if (!pkt) {
> >> -				break;
> >> -			}
> >> +			if (likely(vsock->mergeable)) {
> >> +				pkt = receive_mergeable(vq, vsock, &len);
> >> +				if (!pkt)
> >> +					break;
> >> +			} else {
> >> +				pkt = virtqueue_get_buf(vq, &len);
> >> +				if (!pkt)
> >> +					break;
> >>
> > 
> > So looking at it, this seems to be the main source of the gain.
> > But why does this require host/guest changes?
> > 
> > 
> > The way I see it:
> > 	- get a buffer and create an skb
> > 	- get the next one, check header matches, if yes
> > 	  tack it on the skb as a fragment. If not then
> > 	  don't, deliver previous one and queue the new one.
> > 
> > 
> 
> Vhost change reason I explain as above, and I hope use kvec
> to instead buf, after all buf only can express a contiguous
> physical memory.
> 
> Thanks,
> Yiwen.


I got the reason but I am not yet convinced it's a good one.
You don't necessarily need host to skip headers
in all but the first chunk. Guest can do this just as well.



> > 
> >> -			vsock->rx_buf_nr--;
> >> +				vsock->rx_buf_nr--;
> >> +			}
> >>
> >>  			/* Drop short/long packets */
> >> -			if (unlikely(len < sizeof(pkt->hdr) ||
> >> -				     len > sizeof(pkt->hdr) + pkt->len)) {
> >> +			if (unlikely(len < vsock_hlen ||
> >> +				     len > vsock_hlen + pkt->len)) {
> >>  				virtio_transport_free_pkt(pkt);
> >>  				continue;
> >>  			}
> >>
> >> -			pkt->len = len - sizeof(pkt->hdr);
> >> +			pkt->len = len - vsock_hlen;
> >>  			virtio_transport_deliver_tap_pkt(pkt);
> >>  			virtio_transport_recv_pkt(pkt);
> >>  		}
> >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> >> index 3ae3a33..123a8b6 100644
> >> --- a/net/vmw_vsock/virtio_transport_common.c
> >> +++ b/net/vmw_vsock/virtio_transport_common.c
> >> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  {
> >>  	struct virtio_vsock_pkt *pkt;
> >>  	int err;
> >> +	void *buf = NULL;
> >>
> >>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
> >>  	if (!pkt)
> >> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  	pkt->vsk		= info->vsk;
> >>
> >>  	if (info->msg && len > 0) {
> >> -		pkt->buf = kmalloc(len, GFP_KERNEL);
> >> -		if (!pkt->buf)
> >> +		buf = kmalloc(len, GFP_KERNEL);
> >> +		if (!buf)
> >>  			goto out_pkt;
> >> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
> >> +		err = memcpy_from_msg(buf, info->msg, len);
> >>  		if (err)
> >>  			goto out;
> >> +
> >> +		pkt->vec[0].iov_base = buf;
> >> +		pkt->vec[0].iov_len = len;
> >> +		pkt->nr_vecs = 1;
> >>  	}
> >>
> >>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
> >> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  	return pkt;
> >>
> >>  out:
> >> -	kfree(pkt->buf);
> >> +	kfree(buf);
> >>  out_pkt:
> >>  	kfree(pkt);
> >>  	return NULL;
> >> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >>  	struct virtio_vsock_pkt *pkt = opaque;
> >>  	struct af_vsockmon_hdr *hdr;
> >>  	struct sk_buff *skb;
> >> +	int i;
> >>
> >>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
> >>  			GFP_ATOMIC);
> >> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
> >>
> >>  	if (pkt->len) {
> >> -		skb_put_data(skb, pkt->buf, pkt->len);
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
> >>  	}
> >>
> >>  	return skb;
> >> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> >>
> >>  	spin_lock_bh(&vvs->rx_lock);
> >>  	while (total < len && !list_empty(&vvs->rx_queue)) {
> >> +		size_t copy_bytes, last_vec_total = 0, vec_off;
> >> +		int i;
> >> +
> >>  		pkt = list_first_entry(&vvs->rx_queue,
> >>  				       struct virtio_vsock_pkt, list);
> >>
> >> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> >>  		 */
> >>  		spin_unlock_bh(&vvs->rx_lock);
> >>
> >> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> >> -		if (err)
> >> -			goto out;
> >> +		for (i = 0; i < pkt->nr_vecs; i++) {
> >> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
> >> +				last_vec_total += pkt->vec[i].iov_len;
> >> +				continue;
> >> +			}
> >> +
> >> +			vec_off = pkt->off - last_vec_total;
> >> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
> >> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
> >> +					copy_bytes);
> >> +			if (err)
> >> +				goto out;
> >> +
> >> +			bytes -= copy_bytes;
> >> +			pkt->off += copy_bytes;
> >> +			total += copy_bytes;
> >> +			last_vec_total += pkt->vec[i].iov_len;
> >> +			if (!bytes)
> >> +				break;
> >> +		}
> >>
> >>  		spin_lock_bh(&vvs->rx_lock);
> >> -
> >> -		total += bytes;
> >> -		pkt->off += bytes;
> >>  		if (pkt->off == pkt->len) {
> >>  			virtio_transport_dec_rx_pkt(vvs, pkt);
> >>  			list_del(&pkt->list);
> >> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
> >>
> >>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
> >>  {
> >> -	kfree(pkt->buf);
> >> -	kfree(pkt);
> >> +	int i;
> >> +
> >> +	if (pkt->mergeable) {
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
> >> +		put_page(virt_to_head_page((void *)pkt));
> >> +	} else {
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			kfree(pkt->vec[i].iov_base);
> >> +		kfree(pkt);
> >> +	}
> >>  }
> >>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
> >>
> >> -- 
> >> 1.8.3.1
> >>
> > 
> > .
> > 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-13  2:38   ` jiangyiwen
@ 2018-12-13 14:29     ` Michael S. Tsirkin
  2018-12-13 14:29     ` Michael S. Tsirkin
  1 sibling, 0 replies; 13+ messages in thread
From: Michael S. Tsirkin @ 2018-12-13 14:29 UTC (permalink / raw)
  To: jiangyiwen; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization

On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:
> Hi Michael,
> 
> On 2018/12/12 23:31, Michael S. Tsirkin wrote:
> > On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> >> Guest receive mergeable rx buffer, it can merge
> >> scatter rx buffer into a big buffer and then copy
> >> to user space.
> >>
> >> In addition, it also use iovec to replace buf in struct
> >> virtio_vsock_pkt, keep tx and rx consistency. The only
> >> difference is now tx still uses a segment of continuous
> >> physical memory to implement.
> >>
> >> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
> >> ---
> >>  drivers/vhost/vsock.c                   |  31 +++++++---
> >>  include/linux/virtio_vsock.h            |   6 +-
> >>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
> >>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
> >>  4 files changed, 166 insertions(+), 35 deletions(-)
> > 
> > 
> > This was supposed to be a guest patch, why is vhost changed here?
> > 
> 
> In mergeable rx buff cases, it need to scatter big packets into several
> buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same
> time, in order to keep tx and rx consistency, I use kvec to replace
> variable buf, because vhost use the variable pkt->buf, so this patch
> caused vhost is changed.

You'd want to split these patches imho.

> >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> >> index dc52b0f..c7ab0dd 100644
> >> --- a/drivers/vhost/vsock.c
> >> +++ b/drivers/vhost/vsock.c
> >> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> >>  		size_t nbytes;
> >>  		size_t len;
> >>  		s16 headcount;
> >> +		size_t remain_len;
> >> +		int i;
> >>
> >>  		spin_lock_bh(&vsock->send_pkt_list_lock);
> >>  		if (list_empty(&vsock->send_pkt_list)) {
> >> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> >>  			break;
> >>  		}
> >>
> >> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> >> -		if (nbytes != pkt->len) {
> >> -			virtio_transport_free_pkt(pkt);
> >> -			vq_err(vq, "Faulted on copying pkt buf\n");
> >> -			break;
> >> +		remain_len = pkt->len;
> >> +		for (i = 0; i < pkt->nr_vecs; i++) {
> >> +			int tmp_len;
> >> +
> >> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
> >> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
> >> +			if (nbytes != tmp_len) {
> >> +				virtio_transport_free_pkt(pkt);
> >> +				vq_err(vq, "Faulted on copying pkt buf\n");
> >> +				break;
> >> +			}
> >> +
> >> +			remain_len -= tmp_len;
> >>  		}
> >>
> >>  		vhost_add_used_n(vq, vq->heads, headcount);
> >> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> >>  	struct iov_iter iov_iter;
> >>  	size_t nbytes;
> >>  	size_t len;
> >> +	void *buf;
> >>
> >>  	if (in != 0) {
> >>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
> >> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> >>  		return NULL;
> >>  	}
> >>
> >> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
> >> -	if (!pkt->buf) {
> >> +	buf = kmalloc(pkt->len, GFP_KERNEL);
> >> +	if (!buf) {
> >>  		kfree(pkt);
> >>  		return NULL;
> >>  	}
> >>
> >> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> >> +	pkt->vec[0].iov_base = buf;
> >> +	pkt->vec[0].iov_len = pkt->len;
> >> +	pkt->nr_vecs = 1;
> >> +
> >> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
> >>  	if (nbytes != pkt->len) {
> >>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> >>  		       pkt->len, nbytes);
> >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> >> index da9e1fe..734eeed 100644
> >> --- a/include/linux/virtio_vsock.h
> >> +++ b/include/linux/virtio_vsock.h
> >> @@ -13,6 +13,8 @@
> >>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
> >>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
> >>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
> >> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
> >> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
> >>
> >>  /* Virtio-vsock feature */
> >>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
> >> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
> >>  	struct list_head list;
> >>  	/* socket refcnt not held, only use for cancellation */
> >>  	struct vsock_sock *vsk;
> >> -	void *buf;
> >> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
> >> +	int nr_vecs;
> >>  	u32 len;
> >>  	u32 off;
> >>  	bool reply;
> >> +	bool mergeable;
> >>  };
> >>
> >>  struct virtio_vsock_pkt_info {
> >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> >> index c4a465c..148b58a 100644
> >> --- a/net/vmw_vsock/virtio_transport.c
> >> +++ b/net/vmw_vsock/virtio_transport.c
> >> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
> >>
> >>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> >>  		sgs[out_sg++] = &hdr;
> >> -		if (pkt->buf) {
> >> -			sg_init_one(&buf, pkt->buf, pkt->len);
> >> +		if (pkt->len) {
> >> +			/* Currently only support a segment of memory in tx */
> >> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
> >> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
> >>  			sgs[out_sg++] = &buf;
> >>  		}
> >>
> >> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
> >>  	struct virtio_vsock_pkt *pkt;
> >>  	struct scatterlist hdr, buf, *sgs[2];
> >>  	int ret;
> >> +	void *pkt_buf;
> >>
> >>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
> >>  	if (!pkt)
> >>  		return -ENOMEM;
> >>
> >> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
> >> -	if (!pkt->buf) {
> >> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
> >> +	if (!pkt_buf) {
> >>  		virtio_transport_free_pkt(pkt);
> >>  		return -ENOMEM;
> >>  	}
> >>
> >> +	pkt->vec[0].iov_base = pkt_buf;
> >> +	pkt->vec[0].iov_len = buf_len;
> >> +	pkt->nr_vecs = 1;
> >> +
> >>  	pkt->len = buf_len;
> >>
> >>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> >>  	sgs[0] = &hdr;
> >>
> >> -	sg_init_one(&buf, pkt->buf, buf_len);
> >> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
> >>  	sgs[1] = &buf;
> >>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
> >>  	if (ret)
> >> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
> >>  	return val < virtqueue_get_vring_size(vq);
> >>  }
> >>
> >> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> >> +		struct virtio_vsock *vsock, unsigned int *total_len)
> >> +{
> >> +	struct virtio_vsock_pkt *pkt;
> >> +	u16 num_buf;
> >> +	void *buf;
> >> +	unsigned int len;
> >> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> >> +
> >> +	buf = virtqueue_get_buf(vq, &len);
> >> +	if (!buf)
> >> +		return NULL;
> >> +
> >> +	*total_len = len;
> >> +	vsock->rx_buf_nr--;
> >> +
> >> +	if (unlikely(len < vsock_hlen)) {
> >> +		put_page(virt_to_head_page(buf));
> >> +		return NULL;
> >> +	}
> >> +
> >> +	pkt = buf;
> >> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> >> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> >> +		put_page(virt_to_head_page(buf));
> >> +		return NULL;
> >> +	}
> > 
> > So everything just stops going, and host and user don't even
> > know what the reason is. And not only that - the next
> > packet will be corrupted because we skipped the first one.
> > 
> > 
> 
> I understand this case will not encountered unless the code has
> *BUG*, like Host send some problematic packages (shorten/longer than
> expected). In this case, I think we should ignore/drop these packets.

If there's a specific packet length expected — e.g., in this case the vector
size — it needs to be negotiated between host and guest in some way.

> > 
> >> +
> >> +	/* Initialize pkt residual structure */
> >> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> >> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> >> +
> >> +	pkt->mergeable = true;
> >> +	pkt->len = le32_to_cpu(pkt->hdr.len);
> >> +	if (!pkt->len)
> >> +		return pkt;
> >> +
> >> +	len -= vsock_hlen;
> >> +	if (len) {
> >> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
> >> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> >> +		/* Shared page with pkt, so get page in advance */
> >> +		get_page(virt_to_head_page(buf));
> >> +		pkt->nr_vecs++;
> >> +	}
> >> +
> >> +	while (--num_buf) {
> >> +		buf = virtqueue_get_buf(vq, &len);
> >> +		if (!buf)
> >> +			goto err;
> >> +
> >> +		*total_len += len;
> >> +		vsock->rx_buf_nr--;
> >> +
> >> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
> >> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> >> +		pkt->nr_vecs++;
> >> +	}
> >> +
> >> +	return pkt;
> >> +err:
> >> +	virtio_transport_free_pkt(pkt);
> >> +	return NULL;
> >> +}
> >> +
> >>  static void virtio_transport_rx_work(struct work_struct *work)
> >>  {
> >>  	struct virtio_vsock *vsock =
> >>  		container_of(work, struct virtio_vsock, rx_work);
> >>  	struct virtqueue *vq;
> >> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
> >> +			sizeof(struct virtio_vsock_hdr);
> >>
> >>  	vq = vsock->vqs[VSOCK_VQ_RX];
> >>
> >> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
> >>  				goto out;
> >>  			}
> >>
> >> -			pkt = virtqueue_get_buf(vq, &len);
> >> -			if (!pkt) {
> >> -				break;
> >> -			}
> >> +			if (likely(vsock->mergeable)) {
> >> +				pkt = receive_mergeable(vq, vsock, &len);
> >> +				if (!pkt)
> >> +					break;
> >> +			} else {
> >> +				pkt = virtqueue_get_buf(vq, &len);
> >> +				if (!pkt)
> >> +					break;
> >>
> > 
> > So looking at it, this seems to be the main source of the gain.
> > But why does this require host/guest changes?
> > 
> > 
> > The way I see it:
> > 	- get a buffer and create an skb
> > 	- get the next one, check header matches, if yes
> > 	  tack it on the skb as a fragment. If not then
> > 	  don't, deliver previous one and queue the new one.
> > 
> > 
> 
> Vhost change reason I explain as above, and I hope use kvec
> to instead buf, after all buf only can express a contiguous
> physical memory.
> 
> Thanks,
> Yiwen.


I got the reason, but I am not yet convinced it's a good one.
You don't necessarily need the host to skip headers
in all but the first chunk. The guest can do this just as well.



> > 
> >> -			vsock->rx_buf_nr--;
> >> +				vsock->rx_buf_nr--;
> >> +			}
> >>
> >>  			/* Drop short/long packets */
> >> -			if (unlikely(len < sizeof(pkt->hdr) ||
> >> -				     len > sizeof(pkt->hdr) + pkt->len)) {
> >> +			if (unlikely(len < vsock_hlen ||
> >> +				     len > vsock_hlen + pkt->len)) {
> >>  				virtio_transport_free_pkt(pkt);
> >>  				continue;
> >>  			}
> >>
> >> -			pkt->len = len - sizeof(pkt->hdr);
> >> +			pkt->len = len - vsock_hlen;
> >>  			virtio_transport_deliver_tap_pkt(pkt);
> >>  			virtio_transport_recv_pkt(pkt);
> >>  		}
> >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> >> index 3ae3a33..123a8b6 100644
> >> --- a/net/vmw_vsock/virtio_transport_common.c
> >> +++ b/net/vmw_vsock/virtio_transport_common.c
> >> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  {
> >>  	struct virtio_vsock_pkt *pkt;
> >>  	int err;
> >> +	void *buf = NULL;
> >>
> >>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
> >>  	if (!pkt)
> >> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  	pkt->vsk		= info->vsk;
> >>
> >>  	if (info->msg && len > 0) {
> >> -		pkt->buf = kmalloc(len, GFP_KERNEL);
> >> -		if (!pkt->buf)
> >> +		buf = kmalloc(len, GFP_KERNEL);
> >> +		if (!buf)
> >>  			goto out_pkt;
> >> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
> >> +		err = memcpy_from_msg(buf, info->msg, len);
> >>  		if (err)
> >>  			goto out;
> >> +
> >> +		pkt->vec[0].iov_base = buf;
> >> +		pkt->vec[0].iov_len = len;
> >> +		pkt->nr_vecs = 1;
> >>  	}
> >>
> >>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
> >> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  	return pkt;
> >>
> >>  out:
> >> -	kfree(pkt->buf);
> >> +	kfree(buf);
> >>  out_pkt:
> >>  	kfree(pkt);
> >>  	return NULL;
> >> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >>  	struct virtio_vsock_pkt *pkt = opaque;
> >>  	struct af_vsockmon_hdr *hdr;
> >>  	struct sk_buff *skb;
> >> +	int i;
> >>
> >>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
> >>  			GFP_ATOMIC);
> >> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
> >>
> >>  	if (pkt->len) {
> >> -		skb_put_data(skb, pkt->buf, pkt->len);
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
> >>  	}
> >>
> >>  	return skb;
> >> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> >>
> >>  	spin_lock_bh(&vvs->rx_lock);
> >>  	while (total < len && !list_empty(&vvs->rx_queue)) {
> >> +		size_t copy_bytes, last_vec_total = 0, vec_off;
> >> +		int i;
> >> +
> >>  		pkt = list_first_entry(&vvs->rx_queue,
> >>  				       struct virtio_vsock_pkt, list);
> >>
> >> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> >>  		 */
> >>  		spin_unlock_bh(&vvs->rx_lock);
> >>
> >> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> >> -		if (err)
> >> -			goto out;
> >> +		for (i = 0; i < pkt->nr_vecs; i++) {
> >> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
> >> +				last_vec_total += pkt->vec[i].iov_len;
> >> +				continue;
> >> +			}
> >> +
> >> +			vec_off = pkt->off - last_vec_total;
> >> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
> >> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
> >> +					copy_bytes);
> >> +			if (err)
> >> +				goto out;
> >> +
> >> +			bytes -= copy_bytes;
> >> +			pkt->off += copy_bytes;
> >> +			total += copy_bytes;
> >> +			last_vec_total += pkt->vec[i].iov_len;
> >> +			if (!bytes)
> >> +				break;
> >> +		}
> >>
> >>  		spin_lock_bh(&vvs->rx_lock);
> >> -
> >> -		total += bytes;
> >> -		pkt->off += bytes;
> >>  		if (pkt->off == pkt->len) {
> >>  			virtio_transport_dec_rx_pkt(vvs, pkt);
> >>  			list_del(&pkt->list);
> >> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
> >>
> >>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
> >>  {
> >> -	kfree(pkt->buf);
> >> -	kfree(pkt);
> >> +	int i;
> >> +
> >> +	if (pkt->mergeable) {
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
> >> +		put_page(virt_to_head_page((void *)pkt));
> >> +	} else {
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			kfree(pkt->vec[i].iov_base);
> >> +		kfree(pkt);
> >> +	}
> >>  }
> >>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
> >>
> >> -- 
> >> 1.8.3.1
> >>
> > 
> > .
> > 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-12  9:31 [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest jiangyiwen
  2018-12-12 15:31 ` Michael S. Tsirkin
  2018-12-12 15:31 ` Michael S. Tsirkin
@ 2018-12-13 16:20 ` Stefan Hajnoczi
  2018-12-14  8:41   ` jiangyiwen
  2018-12-14  8:41   ` jiangyiwen
  2018-12-13 16:20 ` Stefan Hajnoczi
  3 siblings, 2 replies; 13+ messages in thread
From: Stefan Hajnoczi @ 2018-12-13 16:20 UTC (permalink / raw)
  To: jiangyiwen; +Cc: Michael S. Tsirkin, Jason Wang, netdev, kvm, virtualization

[-- Attachment #1: Type: text/plain, Size: 1750 bytes --]

On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> +		struct virtio_vsock *vsock, unsigned int *total_len)
> +{
> +	struct virtio_vsock_pkt *pkt;
> +	u16 num_buf;
> +	void *buf;
> +	unsigned int len;
> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> +
> +	buf = virtqueue_get_buf(vq, &len);
> +	if (!buf)
> +		return NULL;
> +
> +	*total_len = len;
> +	vsock->rx_buf_nr--;
> +
> +	if (unlikely(len < vsock_hlen)) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}
> +
> +	pkt = buf;
> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}
> +
> +	/* Initialize pkt residual structure */
> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));

struct virtio_vsock_pkt is an internal driver state structure.  It must
be able to change without breaking the host<->guest device interface.
Exposing struct virtio_vsock_pkt across the device interface makes this
impossible.

One way to solve this is by placing a header length field at the
beginning of the mergeable buffer.  That way the device knows at which
offset the payload should be placed and the driver can continue to use
the allocation scheme you've chosen (where a single page contains the
virtio_vsock_pkt and payload).

Another issue is that virtio requires exclusive read OR write
descriptors.  You cannot have a descriptor that is both read and write.
So there's not a huge advantage to combining the driver state struct and
payload, except maybe the driver can save a memory allocation.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-12  9:31 [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest jiangyiwen
                   ` (2 preceding siblings ...)
  2018-12-13 16:20 ` Stefan Hajnoczi
@ 2018-12-13 16:20 ` Stefan Hajnoczi
  3 siblings, 0 replies; 13+ messages in thread
From: Stefan Hajnoczi @ 2018-12-13 16:20 UTC (permalink / raw)
  To: jiangyiwen; +Cc: netdev, virtualization, kvm, Michael S. Tsirkin


[-- Attachment #1.1: Type: text/plain, Size: 1750 bytes --]

On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> +		struct virtio_vsock *vsock, unsigned int *total_len)
> +{
> +	struct virtio_vsock_pkt *pkt;
> +	u16 num_buf;
> +	void *buf;
> +	unsigned int len;
> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> +
> +	buf = virtqueue_get_buf(vq, &len);
> +	if (!buf)
> +		return NULL;
> +
> +	*total_len = len;
> +	vsock->rx_buf_nr--;
> +
> +	if (unlikely(len < vsock_hlen)) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}
> +
> +	pkt = buf;
> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> +		put_page(virt_to_head_page(buf));
> +		return NULL;
> +	}
> +
> +	/* Initialize pkt residual structure */
> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));

struct virtio_vsock_pkt is an internal driver state structure.  It must
be able to change without breaking the host<->guest device interface.
Exposing struct virtio_vsock_pkt across the device interface makes this
impossible.

One way to solve this is by placing a header length field at the
beginning of the mergeable buffer.  That way the device knows at which
offset the payload should be placed and the driver can continue to use
the allocation scheme you've chosen (where a single page contains the
virtio_vsock_pkt and payload).

Another issue is that virtio requires exclusive read OR write
descriptors.  You cannot have a descriptor that is both read and write.
So there's not a huge advantage to combining the driver state struct and
payload, except maybe the driver can save a memory allocation.

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-13 14:29     ` Michael S. Tsirkin
  2018-12-14  8:11       ` jiangyiwen
@ 2018-12-14  8:11       ` jiangyiwen
  1 sibling, 0 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-14  8:11 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Stefan Hajnoczi, Jason Wang, netdev, kvm, virtualization

On 2018/12/13 22:29, Michael S. Tsirkin wrote:
> On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:
>> Hi Michael,
>>
>> On 2018/12/12 23:31, Michael S. Tsirkin wrote:
>>> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>>>> Guest receive mergeable rx buffer, it can merge
>>>> scatter rx buffer into a big buffer and then copy
>>>> to user space.
>>>>
>>>> In addition, it also use iovec to replace buf in struct
>>>> virtio_vsock_pkt, keep tx and rx consistency. The only
>>>> difference is now tx still uses a segment of continuous
>>>> physical memory to implement.
>>>>
>>>> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
>>>> ---
>>>>  drivers/vhost/vsock.c                   |  31 +++++++---
>>>>  include/linux/virtio_vsock.h            |   6 +-
>>>>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
>>>>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
>>>>  4 files changed, 166 insertions(+), 35 deletions(-)
>>>
>>>
>>> This was supposed to be a guest patch, why is vhost changed here?
>>>
>>
>> In mergeable rx buff cases, it need to scatter big packets into several
>> buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same
>> time, in order to keep tx and rx consistency, I use kvec to replace
>> variable buf, because vhost use the variable pkt->buf, so this patch
>> caused vhost is changed.
> 
> You'd want to split these patches imho.
> 

Ok, I should do it.

>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>> index dc52b0f..c7ab0dd 100644
>>>> --- a/drivers/vhost/vsock.c
>>>> +++ b/drivers/vhost/vsock.c
>>>> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>>>  		size_t nbytes;
>>>>  		size_t len;
>>>>  		s16 headcount;
>>>> +		size_t remain_len;
>>>> +		int i;
>>>>
>>>>  		spin_lock_bh(&vsock->send_pkt_list_lock);
>>>>  		if (list_empty(&vsock->send_pkt_list)) {
>>>> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>>>  			break;
>>>>  		}
>>>>
>>>> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
>>>> -		if (nbytes != pkt->len) {
>>>> -			virtio_transport_free_pkt(pkt);
>>>> -			vq_err(vq, "Faulted on copying pkt buf\n");
>>>> -			break;
>>>> +		remain_len = pkt->len;
>>>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>>>> +			int tmp_len;
>>>> +
>>>> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
>>>> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
>>>> +			if (nbytes != tmp_len) {
>>>> +				virtio_transport_free_pkt(pkt);
>>>> +				vq_err(vq, "Faulted on copying pkt buf\n");
>>>> +				break;
>>>> +			}
>>>> +
>>>> +			remain_len -= tmp_len;
>>>>  		}
>>>>
>>>>  		vhost_add_used_n(vq, vq->heads, headcount);
>>>> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>>>  	struct iov_iter iov_iter;
>>>>  	size_t nbytes;
>>>>  	size_t len;
>>>> +	void *buf;
>>>>
>>>>  	if (in != 0) {
>>>>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
>>>> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
>>>> -	if (!pkt->buf) {
>>>> +	buf = kmalloc(pkt->len, GFP_KERNEL);
>>>> +	if (!buf) {
>>>>  		kfree(pkt);
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>>>> +	pkt->vec[0].iov_base = buf;
>>>> +	pkt->vec[0].iov_len = pkt->len;
>>>> +	pkt->nr_vecs = 1;
>>>> +
>>>> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
>>>>  	if (nbytes != pkt->len) {
>>>>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>>>  		       pkt->len, nbytes);
>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>> index da9e1fe..734eeed 100644
>>>> --- a/include/linux/virtio_vsock.h
>>>> +++ b/include/linux/virtio_vsock.h
>>>> @@ -13,6 +13,8 @@
>>>>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
>>>>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
>>>>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
>>>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
>>>> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>>>>
>>>>  /* Virtio-vsock feature */
>>>>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>>>> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
>>>>  	struct list_head list;
>>>>  	/* socket refcnt not held, only use for cancellation */
>>>>  	struct vsock_sock *vsk;
>>>> -	void *buf;
>>>> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
>>>> +	int nr_vecs;
>>>>  	u32 len;
>>>>  	u32 off;
>>>>  	bool reply;
>>>> +	bool mergeable;
>>>>  };
>>>>
>>>>  struct virtio_vsock_pkt_info {
>>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>>> index c4a465c..148b58a 100644
>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
>>>>
>>>>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>  		sgs[out_sg++] = &hdr;
>>>> -		if (pkt->buf) {
>>>> -			sg_init_one(&buf, pkt->buf, pkt->len);
>>>> +		if (pkt->len) {
>>>> +			/* Currently only support a segment of memory in tx */
>>>> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
>>>> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
>>>>  			sgs[out_sg++] = &buf;
>>>>  		}
>>>>
>>>> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
>>>>  	struct virtio_vsock_pkt *pkt;
>>>>  	struct scatterlist hdr, buf, *sgs[2];
>>>>  	int ret;
>>>> +	void *pkt_buf;
>>>>
>>>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>>>  	if (!pkt)
>>>>  		return -ENOMEM;
>>>>
>>>> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
>>>> -	if (!pkt->buf) {
>>>> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
>>>> +	if (!pkt_buf) {
>>>>  		virtio_transport_free_pkt(pkt);
>>>>  		return -ENOMEM;
>>>>  	}
>>>>
>>>> +	pkt->vec[0].iov_base = pkt_buf;
>>>> +	pkt->vec[0].iov_len = buf_len;
>>>> +	pkt->nr_vecs = 1;
>>>> +
>>>>  	pkt->len = buf_len;
>>>>
>>>>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>  	sgs[0] = &hdr;
>>>>
>>>> -	sg_init_one(&buf, pkt->buf, buf_len);
>>>> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
>>>>  	sgs[1] = &buf;
>>>>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>>>>  	if (ret)
>>>> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
>>>>  	return val < virtqueue_get_vring_size(vq);
>>>>  }
>>>>
>>>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>>>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>>>> +{
>>>> +	struct virtio_vsock_pkt *pkt;
>>>> +	u16 num_buf;
>>>> +	void *buf;
>>>> +	unsigned int len;
>>>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>>>> +
>>>> +	buf = virtqueue_get_buf(vq, &len);
>>>> +	if (!buf)
>>>> +		return NULL;
>>>> +
>>>> +	*total_len = len;
>>>> +	vsock->rx_buf_nr--;
>>>> +
>>>> +	if (unlikely(len < vsock_hlen)) {
>>>> +		put_page(virt_to_head_page(buf));
>>>> +		return NULL;
>>>> +	}
>>>> +
>>>> +	pkt = buf;
>>>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>>>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>>>> +		put_page(virt_to_head_page(buf));
>>>> +		return NULL;
>>>> +	}
>>>
>>> So everything just stops going, and host and user don't even
>>> know what the reason is. And not only that - the next
>>> packet will be corrupted because we skipped the first one.
>>>
>>>
>>
>> I understand this case will not encountered unless the code has
>> *BUG*, like Host send some problematic packages (shorten/longer than
>> expected). In this case, I think we should ignore/drop these packets.
> 
> If there's a specific packet length expected, e.g.  in this case vector
> size, it needs to be negotiated between host and guest in some way.
> 

Actually, I still keep the negotiation in virtio_transport_rx_work():
in receive_mergeable() I calculate the total length and compare it with
pkt->hdr.len (packed by the host).

>>>
>>>> +
>>>> +	/* Initialize pkt residual structure */
>>>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>>>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
>>>> +
>>>> +	pkt->mergeable = true;
>>>> +	pkt->len = le32_to_cpu(pkt->hdr.len);
>>>> +	if (!pkt->len)
>>>> +		return pkt;
>>>> +
>>>> +	len -= vsock_hlen;
>>>> +	if (len) {
>>>> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
>>>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>>>> +		/* Shared page with pkt, so get page in advance */
>>>> +		get_page(virt_to_head_page(buf));
>>>> +		pkt->nr_vecs++;
>>>> +	}
>>>> +
>>>> +	while (--num_buf) {
>>>> +		buf = virtqueue_get_buf(vq, &len);
>>>> +		if (!buf)
>>>> +			goto err;
>>>> +
>>>> +		*total_len += len;
>>>> +		vsock->rx_buf_nr--;
>>>> +
>>>> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
>>>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>>>> +		pkt->nr_vecs++;
>>>> +	}
>>>> +
>>>> +	return pkt;
>>>> +err:
>>>> +	virtio_transport_free_pkt(pkt);
>>>> +	return NULL;
>>>> +}
>>>> +
>>>>  static void virtio_transport_rx_work(struct work_struct *work)
>>>>  {
>>>>  	struct virtio_vsock *vsock =
>>>>  		container_of(work, struct virtio_vsock, rx_work);
>>>>  	struct virtqueue *vq;
>>>> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
>>>> +			sizeof(struct virtio_vsock_hdr);
>>>>
>>>>  	vq = vsock->vqs[VSOCK_VQ_RX];
>>>>
>>>> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
>>>>  				goto out;
>>>>  			}
>>>>
>>>> -			pkt = virtqueue_get_buf(vq, &len);
>>>> -			if (!pkt) {
>>>> -				break;
>>>> -			}
>>>> +			if (likely(vsock->mergeable)) {
>>>> +				pkt = receive_mergeable(vq, vsock, &len);
>>>> +				if (!pkt)
>>>> +					break;
>>>> +			} else {
>>>> +				pkt = virtqueue_get_buf(vq, &len);
>>>> +				if (!pkt)
>>>> +					break;
>>>>
>>>
>>> So looking at it, this seems to be the main source of the gain.
>>> But why does this require host/guest changes?
>>>
>>>
>>> The way I see it:
>>> 	- get a buffer and create an skb
>>> 	- get the next one, check header matches, if yes
>>> 	  tack it on the skb as a fragment. If not then
>>> 	  don't, deliver previous one and queue the new one.
>>>
>>>
>>
>> Vhost change reason I explain as above, and I hope use kvec
>> to instead buf, after all buf only can express a contiguous
>> physical memory.
>>
>> Thanks,
>> Yiwen.
> 
> 
> I got the reason but I am not yet convinced it's a good one.
> You don't necessarily need host to skip headers
> in all but the first chunk. Guest can do this just as well.
> 
> 

I understand what you mean. Right, it only wastes some memory, but it
is more convenient: I don't need to skip headers and the host doesn't
need to transmit the whole pkt. I will fix it in a later
version.

Thanks,
Yiwen.

> 
>>>
>>>> -			vsock->rx_buf_nr--;
>>>> +				vsock->rx_buf_nr--;
>>>> +			}
>>>>
>>>>  			/* Drop short/long packets */
>>>> -			if (unlikely(len < sizeof(pkt->hdr) ||
>>>> -				     len > sizeof(pkt->hdr) + pkt->len)) {
>>>> +			if (unlikely(len < vsock_hlen ||
>>>> +				     len > vsock_hlen + pkt->len)) {
>>>>  				virtio_transport_free_pkt(pkt);
>>>>  				continue;
>>>>  			}
>>>>
>>>> -			pkt->len = len - sizeof(pkt->hdr);
>>>> +			pkt->len = len - vsock_hlen;
>>>>  			virtio_transport_deliver_tap_pkt(pkt);
>>>>  			virtio_transport_recv_pkt(pkt);
>>>>  		}
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>> index 3ae3a33..123a8b6 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  {
>>>>  	struct virtio_vsock_pkt *pkt;
>>>>  	int err;
>>>> +	void *buf = NULL;
>>>>
>>>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>>>  	if (!pkt)
>>>> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  	pkt->vsk		= info->vsk;
>>>>
>>>>  	if (info->msg && len > 0) {
>>>> -		pkt->buf = kmalloc(len, GFP_KERNEL);
>>>> -		if (!pkt->buf)
>>>> +		buf = kmalloc(len, GFP_KERNEL);
>>>> +		if (!buf)
>>>>  			goto out_pkt;
>>>> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
>>>> +		err = memcpy_from_msg(buf, info->msg, len);
>>>>  		if (err)
>>>>  			goto out;
>>>> +
>>>> +		pkt->vec[0].iov_base = buf;
>>>> +		pkt->vec[0].iov_len = len;
>>>> +		pkt->nr_vecs = 1;
>>>>  	}
>>>>
>>>>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  	return pkt;
>>>>
>>>>  out:
>>>> -	kfree(pkt->buf);
>>>> +	kfree(buf);
>>>>  out_pkt:
>>>>  	kfree(pkt);
>>>>  	return NULL;
>>>> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>>>  	struct virtio_vsock_pkt *pkt = opaque;
>>>>  	struct af_vsockmon_hdr *hdr;
>>>>  	struct sk_buff *skb;
>>>> +	int i;
>>>>
>>>>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
>>>>  			GFP_ATOMIC);
>>>> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>>>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
>>>>
>>>>  	if (pkt->len) {
>>>> -		skb_put_data(skb, pkt->buf, pkt->len);
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
>>>>  	}
>>>>
>>>>  	return skb;
>>>> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>>>
>>>>  	spin_lock_bh(&vvs->rx_lock);
>>>>  	while (total < len && !list_empty(&vvs->rx_queue)) {
>>>> +		size_t copy_bytes, last_vec_total = 0, vec_off;
>>>> +		int i;
>>>> +
>>>>  		pkt = list_first_entry(&vvs->rx_queue,
>>>>  				       struct virtio_vsock_pkt, list);
>>>>
>>>> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>>>  		 */
>>>>  		spin_unlock_bh(&vvs->rx_lock);
>>>>
>>>> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
>>>> -		if (err)
>>>> -			goto out;
>>>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>>>> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
>>>> +				last_vec_total += pkt->vec[i].iov_len;
>>>> +				continue;
>>>> +			}
>>>> +
>>>> +			vec_off = pkt->off - last_vec_total;
>>>> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
>>>> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
>>>> +					copy_bytes);
>>>> +			if (err)
>>>> +				goto out;
>>>> +
>>>> +			bytes -= copy_bytes;
>>>> +			pkt->off += copy_bytes;
>>>> +			total += copy_bytes;
>>>> +			last_vec_total += pkt->vec[i].iov_len;
>>>> +			if (!bytes)
>>>> +				break;
>>>> +		}
>>>>
>>>>  		spin_lock_bh(&vvs->rx_lock);
>>>> -
>>>> -		total += bytes;
>>>> -		pkt->off += bytes;
>>>>  		if (pkt->off == pkt->len) {
>>>>  			virtio_transport_dec_rx_pkt(vvs, pkt);
>>>>  			list_del(&pkt->list);
>>>> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
>>>>
>>>>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
>>>>  {
>>>> -	kfree(pkt->buf);
>>>> -	kfree(pkt);
>>>> +	int i;
>>>> +
>>>> +	if (pkt->mergeable) {
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
>>>> +		put_page(virt_to_head_page((void *)pkt));
>>>> +	} else {
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			kfree(pkt->vec[i].iov_base);
>>>> +		kfree(pkt);
>>>> +	}
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
>>>>
>>>> -- 
>>>> 1.8.3.1
>>>>
>>>
>>> .
>>>
>>
> 
> .
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-13 14:29     ` Michael S. Tsirkin
@ 2018-12-14  8:11       ` jiangyiwen
  2018-12-14  8:11       ` jiangyiwen
  1 sibling, 0 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-14  8:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization

On 2018/12/13 22:29, Michael S. Tsirkin wrote:
> On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:
>> Hi Michael,
>>
>> On 2018/12/12 23:31, Michael S. Tsirkin wrote:
>>> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>>>> Guest receive mergeable rx buffer, it can merge
>>>> scatter rx buffer into a big buffer and then copy
>>>> to user space.
>>>>
>>>> In addition, it also use iovec to replace buf in struct
>>>> virtio_vsock_pkt, keep tx and rx consistency. The only
>>>> difference is now tx still uses a segment of continuous
>>>> physical memory to implement.
>>>>
>>>> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
>>>> ---
>>>>  drivers/vhost/vsock.c                   |  31 +++++++---
>>>>  include/linux/virtio_vsock.h            |   6 +-
>>>>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
>>>>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
>>>>  4 files changed, 166 insertions(+), 35 deletions(-)
>>>
>>>
>>> This was supposed to be a guest patch, why is vhost changed here?
>>>
>>
>> In mergeable rx buff cases, it need to scatter big packets into several
>> buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same
>> time, in order to keep tx and rx consistency, I use kvec to replace
>> variable buf, because vhost use the variable pkt->buf, so this patch
>> caused vhost is changed.
> 
> You'd want to split these patches imho.
> 

Ok, I should do it.

>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>> index dc52b0f..c7ab0dd 100644
>>>> --- a/drivers/vhost/vsock.c
>>>> +++ b/drivers/vhost/vsock.c
>>>> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>>>  		size_t nbytes;
>>>>  		size_t len;
>>>>  		s16 headcount;
>>>> +		size_t remain_len;
>>>> +		int i;
>>>>
>>>>  		spin_lock_bh(&vsock->send_pkt_list_lock);
>>>>  		if (list_empty(&vsock->send_pkt_list)) {
>>>> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>>>  			break;
>>>>  		}
>>>>
>>>> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
>>>> -		if (nbytes != pkt->len) {
>>>> -			virtio_transport_free_pkt(pkt);
>>>> -			vq_err(vq, "Faulted on copying pkt buf\n");
>>>> -			break;
>>>> +		remain_len = pkt->len;
>>>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>>>> +			int tmp_len;
>>>> +
>>>> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
>>>> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
>>>> +			if (nbytes != tmp_len) {
>>>> +				virtio_transport_free_pkt(pkt);
>>>> +				vq_err(vq, "Faulted on copying pkt buf\n");
>>>> +				break;
>>>> +			}
>>>> +
>>>> +			remain_len -= tmp_len;
>>>>  		}
>>>>
>>>>  		vhost_add_used_n(vq, vq->heads, headcount);
>>>> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>>>  	struct iov_iter iov_iter;
>>>>  	size_t nbytes;
>>>>  	size_t len;
>>>> +	void *buf;
>>>>
>>>>  	if (in != 0) {
>>>>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
>>>> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
>>>> -	if (!pkt->buf) {
>>>> +	buf = kmalloc(pkt->len, GFP_KERNEL);
>>>> +	if (!buf) {
>>>>  		kfree(pkt);
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>>>> +	pkt->vec[0].iov_base = buf;
>>>> +	pkt->vec[0].iov_len = pkt->len;
>>>> +	pkt->nr_vecs = 1;
>>>> +
>>>> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
>>>>  	if (nbytes != pkt->len) {
>>>>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>>>  		       pkt->len, nbytes);
>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>> index da9e1fe..734eeed 100644
>>>> --- a/include/linux/virtio_vsock.h
>>>> +++ b/include/linux/virtio_vsock.h
>>>> @@ -13,6 +13,8 @@
>>>>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
>>>>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
>>>>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
>>>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
>>>> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>>>>
>>>>  /* Virtio-vsock feature */
>>>>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>>>> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
>>>>  	struct list_head list;
>>>>  	/* socket refcnt not held, only use for cancellation */
>>>>  	struct vsock_sock *vsk;
>>>> -	void *buf;
>>>> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
>>>> +	int nr_vecs;
>>>>  	u32 len;
>>>>  	u32 off;
>>>>  	bool reply;
>>>> +	bool mergeable;
>>>>  };
>>>>
>>>>  struct virtio_vsock_pkt_info {
>>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>>> index c4a465c..148b58a 100644
>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
>>>>
>>>>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>  		sgs[out_sg++] = &hdr;
>>>> -		if (pkt->buf) {
>>>> -			sg_init_one(&buf, pkt->buf, pkt->len);
>>>> +		if (pkt->len) {
>>>> +			/* Currently only support a segment of memory in tx */
>>>> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
>>>> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
>>>>  			sgs[out_sg++] = &buf;
>>>>  		}
>>>>
>>>> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
>>>>  	struct virtio_vsock_pkt *pkt;
>>>>  	struct scatterlist hdr, buf, *sgs[2];
>>>>  	int ret;
>>>> +	void *pkt_buf;
>>>>
>>>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>>>  	if (!pkt)
>>>>  		return -ENOMEM;
>>>>
>>>> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
>>>> -	if (!pkt->buf) {
>>>> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
>>>> +	if (!pkt_buf) {
>>>>  		virtio_transport_free_pkt(pkt);
>>>>  		return -ENOMEM;
>>>>  	}
>>>>
>>>> +	pkt->vec[0].iov_base = pkt_buf;
>>>> +	pkt->vec[0].iov_len = buf_len;
>>>> +	pkt->nr_vecs = 1;
>>>> +
>>>>  	pkt->len = buf_len;
>>>>
>>>>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>  	sgs[0] = &hdr;
>>>>
>>>> -	sg_init_one(&buf, pkt->buf, buf_len);
>>>> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
>>>>  	sgs[1] = &buf;
>>>>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>>>>  	if (ret)
>>>> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
>>>>  	return val < virtqueue_get_vring_size(vq);
>>>>  }
>>>>
>>>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>>>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>>>> +{
>>>> +	struct virtio_vsock_pkt *pkt;
>>>> +	u16 num_buf;
>>>> +	void *buf;
>>>> +	unsigned int len;
>>>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>>>> +
>>>> +	buf = virtqueue_get_buf(vq, &len);
>>>> +	if (!buf)
>>>> +		return NULL;
>>>> +
>>>> +	*total_len = len;
>>>> +	vsock->rx_buf_nr--;
>>>> +
>>>> +	if (unlikely(len < vsock_hlen)) {
>>>> +		put_page(virt_to_head_page(buf));
>>>> +		return NULL;
>>>> +	}
>>>> +
>>>> +	pkt = buf;
>>>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>>>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>>>> +		put_page(virt_to_head_page(buf));
>>>> +		return NULL;
>>>> +	}
>>>
>>> So everything just stops going, and host and user don't even
>>> know what the reason is. And not only that - the next
>>> packet will be corrupted because we skipped the first one.
>>>
>>>
>>
>> I understand this case will not encountered unless the code has
>> *BUG*, like Host send some problematic packages (shorten/longer than
>> expected). In this case, I think we should ignore/drop these packets.
> 
> If there's a specific packet length expected, e.g.  in this case vector
> size, it needs to be negotiated between host and guest in some way.
> 

Actually, I still keep the negotiation in virtio_transport_rx_work():
in receive_mergeable() I calculate the total length and compare it with
pkt->hdr.len (packed by the host).

>>>
>>>> +
>>>> +	/* Initialize pkt residual structure */
>>>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>>>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
>>>> +
>>>> +	pkt->mergeable = true;
>>>> +	pkt->len = le32_to_cpu(pkt->hdr.len);
>>>> +	if (!pkt->len)
>>>> +		return pkt;
>>>> +
>>>> +	len -= vsock_hlen;
>>>> +	if (len) {
>>>> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
>>>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>>>> +		/* Shared page with pkt, so get page in advance */
>>>> +		get_page(virt_to_head_page(buf));
>>>> +		pkt->nr_vecs++;
>>>> +	}
>>>> +
>>>> +	while (--num_buf) {
>>>> +		buf = virtqueue_get_buf(vq, &len);
>>>> +		if (!buf)
>>>> +			goto err;
>>>> +
>>>> +		*total_len += len;
>>>> +		vsock->rx_buf_nr--;
>>>> +
>>>> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
>>>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>>>> +		pkt->nr_vecs++;
>>>> +	}
>>>> +
>>>> +	return pkt;
>>>> +err:
>>>> +	virtio_transport_free_pkt(pkt);
>>>> +	return NULL;
>>>> +}
>>>> +
>>>>  static void virtio_transport_rx_work(struct work_struct *work)
>>>>  {
>>>>  	struct virtio_vsock *vsock =
>>>>  		container_of(work, struct virtio_vsock, rx_work);
>>>>  	struct virtqueue *vq;
>>>> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
>>>> +			sizeof(struct virtio_vsock_hdr);
>>>>
>>>>  	vq = vsock->vqs[VSOCK_VQ_RX];
>>>>
>>>> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
>>>>  				goto out;
>>>>  			}
>>>>
>>>> -			pkt = virtqueue_get_buf(vq, &len);
>>>> -			if (!pkt) {
>>>> -				break;
>>>> -			}
>>>> +			if (likely(vsock->mergeable)) {
>>>> +				pkt = receive_mergeable(vq, vsock, &len);
>>>> +				if (!pkt)
>>>> +					break;
>>>> +			} else {
>>>> +				pkt = virtqueue_get_buf(vq, &len);
>>>> +				if (!pkt)
>>>> +					break;
>>>>
>>>
>>> So looking at it, this seems to be the main source of the gain.
>>> But why does this require host/guest changes?
>>>
>>>
>>> The way I see it:
>>> 	- get a buffer and create an skb
>>> 	- get the next one, check header matches, if yes
>>> 	  tack it on the skb as a fragment. If not then
>>> 	  don't, deliver previous one and queue the new one.
>>>
>>>
>>
>> Vhost change reason I explain as above, and I hope use kvec
>> to instead buf, after all buf only can express a contiguous
>> physical memory.
>>
>> Thanks,
>> Yiwen.
> 
> 
> I got the reason but I am not yet convinced it's a good one.
> You don't necessarily need host to skip headers
> in all but the first chunk. Guest can do this just as well.
> 
> 

I understand what you mean. Right, it only wastes some memory, but it
is more convenient: I don't need to skip headers and the host doesn't
need to transmit the whole pkt. I will fix it in a later
version.

Thanks,
Yiwen.

> 
>>>
>>>> -			vsock->rx_buf_nr--;
>>>> +				vsock->rx_buf_nr--;
>>>> +			}
>>>>
>>>>  			/* Drop short/long packets */
>>>> -			if (unlikely(len < sizeof(pkt->hdr) ||
>>>> -				     len > sizeof(pkt->hdr) + pkt->len)) {
>>>> +			if (unlikely(len < vsock_hlen ||
>>>> +				     len > vsock_hlen + pkt->len)) {
>>>>  				virtio_transport_free_pkt(pkt);
>>>>  				continue;
>>>>  			}
>>>>
>>>> -			pkt->len = len - sizeof(pkt->hdr);
>>>> +			pkt->len = len - vsock_hlen;
>>>>  			virtio_transport_deliver_tap_pkt(pkt);
>>>>  			virtio_transport_recv_pkt(pkt);
>>>>  		}
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>> index 3ae3a33..123a8b6 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  {
>>>>  	struct virtio_vsock_pkt *pkt;
>>>>  	int err;
>>>> +	void *buf = NULL;
>>>>
>>>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>>>  	if (!pkt)
>>>> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  	pkt->vsk		= info->vsk;
>>>>
>>>>  	if (info->msg && len > 0) {
>>>> -		pkt->buf = kmalloc(len, GFP_KERNEL);
>>>> -		if (!pkt->buf)
>>>> +		buf = kmalloc(len, GFP_KERNEL);
>>>> +		if (!buf)
>>>>  			goto out_pkt;
>>>> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
>>>> +		err = memcpy_from_msg(buf, info->msg, len);
>>>>  		if (err)
>>>>  			goto out;
>>>> +
>>>> +		pkt->vec[0].iov_base = buf;
>>>> +		pkt->vec[0].iov_len = len;
>>>> +		pkt->nr_vecs = 1;
>>>>  	}
>>>>
>>>>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  	return pkt;
>>>>
>>>>  out:
>>>> -	kfree(pkt->buf);
>>>> +	kfree(buf);
>>>>  out_pkt:
>>>>  	kfree(pkt);
>>>>  	return NULL;
>>>> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>>>  	struct virtio_vsock_pkt *pkt = opaque;
>>>>  	struct af_vsockmon_hdr *hdr;
>>>>  	struct sk_buff *skb;
>>>> +	int i;
>>>>
>>>>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
>>>>  			GFP_ATOMIC);
>>>> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>>>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
>>>>
>>>>  	if (pkt->len) {
>>>> -		skb_put_data(skb, pkt->buf, pkt->len);
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
>>>>  	}
>>>>
>>>>  	return skb;
>>>> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>>>
>>>>  	spin_lock_bh(&vvs->rx_lock);
>>>>  	while (total < len && !list_empty(&vvs->rx_queue)) {
>>>> +		size_t copy_bytes, last_vec_total = 0, vec_off;
>>>> +		int i;
>>>> +
>>>>  		pkt = list_first_entry(&vvs->rx_queue,
>>>>  				       struct virtio_vsock_pkt, list);
>>>>
>>>> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>>>  		 */
>>>>  		spin_unlock_bh(&vvs->rx_lock);
>>>>
>>>> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
>>>> -		if (err)
>>>> -			goto out;
>>>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>>>> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
>>>> +				last_vec_total += pkt->vec[i].iov_len;
>>>> +				continue;
>>>> +			}
>>>> +
>>>> +			vec_off = pkt->off - last_vec_total;
>>>> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
>>>> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
>>>> +					copy_bytes);
>>>> +			if (err)
>>>> +				goto out;
>>>> +
>>>> +			bytes -= copy_bytes;
>>>> +			pkt->off += copy_bytes;
>>>> +			total += copy_bytes;
>>>> +			last_vec_total += pkt->vec[i].iov_len;
>>>> +			if (!bytes)
>>>> +				break;
>>>> +		}
>>>>
>>>>  		spin_lock_bh(&vvs->rx_lock);
>>>> -
>>>> -		total += bytes;
>>>> -		pkt->off += bytes;
>>>>  		if (pkt->off == pkt->len) {
>>>>  			virtio_transport_dec_rx_pkt(vvs, pkt);
>>>>  			list_del(&pkt->list);
>>>> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
>>>>
>>>>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
>>>>  {
>>>> -	kfree(pkt->buf);
>>>> -	kfree(pkt);
>>>> +	int i;
>>>> +
>>>> +	if (pkt->mergeable) {
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
>>>> +		put_page(virt_to_head_page((void *)pkt));
>>>> +	} else {
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			kfree(pkt->vec[i].iov_base);
>>>> +		kfree(pkt);
>>>> +	}
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
>>>>
>>>> -- 
>>>> 1.8.3.1
>>>>
>>>
>>> .
>>>
>>
> 
> .
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-13 16:20 ` Stefan Hajnoczi
  2018-12-14  8:41   ` jiangyiwen
@ 2018-12-14  8:41   ` jiangyiwen
  1 sibling, 0 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-14  8:41 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Michael S. Tsirkin, Jason Wang, netdev, kvm, virtualization

On 2018/12/14 0:20, Stefan Hajnoczi wrote:
> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>> +{
>> +	struct virtio_vsock_pkt *pkt;
>> +	u16 num_buf;
>> +	void *buf;
>> +	unsigned int len;
>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>> +
>> +	buf = virtqueue_get_buf(vq, &len);
>> +	if (!buf)
>> +		return NULL;
>> +
>> +	*total_len = len;
>> +	vsock->rx_buf_nr--;
>> +
>> +	if (unlikely(len < vsock_hlen)) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	pkt = buf;
>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	/* Initialize pkt residual structure */
>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> 
> struct virtio_vsock_pkt is an internal driver state structure.  It must
> be able to change without breaking the host<->guest device interface.
> Exposing struct virtio_vsock_pkt across the device interface makes this
> impossible.
> 
> One way to solve this is by placing a header length field at the
> beginning of the mergeable buffer.  That way the device knows at which
> offset the payload should be placed and the driver can continue to use
> the allocation scheme you've chosen (where a single page contains the
> virtio_vsock_pkt and payload).
> 
> Another issue is that virtio requires exclusive read OR write
> descriptors.  You cannot have a descriptor that is both read and write.
> So there's not a huge advantage to combining the driver state struct and
> payload, except maybe the driver can save a memory allocation.
> 

Right, I will separate the pkt and payload into different pages, as you
mentioned in the other patch.

Thanks,
Yiwen.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
  2018-12-13 16:20 ` Stefan Hajnoczi
@ 2018-12-14  8:41   ` jiangyiwen
  2018-12-14  8:41   ` jiangyiwen
  1 sibling, 0 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-14  8:41 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev, virtualization, kvm, Michael S. Tsirkin

On 2018/12/14 0:20, Stefan Hajnoczi wrote:
> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>> +{
>> +	struct virtio_vsock_pkt *pkt;
>> +	u16 num_buf;
>> +	void *buf;
>> +	unsigned int len;
>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>> +
>> +	buf = virtqueue_get_buf(vq, &len);
>> +	if (!buf)
>> +		return NULL;
>> +
>> +	*total_len = len;
>> +	vsock->rx_buf_nr--;
>> +
>> +	if (unlikely(len < vsock_hlen)) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	pkt = buf;
>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	/* Initialize pkt residual structure */
>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> 
> struct virtio_vsock_pkt is an internal driver state structure.  It must
> be able to change without breaking the host<->guest device interface.
> Exposing struct virtio_vsock_pkt across the device interface makes this
> impossible.
> 
> One way to solve this is by placing a header length field at the
> beginning of the mergeable buffer.  That way the device knows at which
> offset the payload should be placed and the driver can continue to use
> the allocation scheme you've chosen (where a single page contains the
> virtio_vsock_pkt and payload).
> 
> Another issue is that virtio requires exclusive read OR write
> descriptors.  You cannot have a descriptor that is both read and write.
> So there's not a huge advantage to combining the driver state struct and
> payload, except maybe the driver can save a memory allocation.
> 

Right, I will separate the pkt and payload into different pages, as you
mentioned in the other patch.

Thanks,
Yiwen.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
@ 2018-12-12  9:31 jiangyiwen
  0 siblings, 0 replies; 13+ messages in thread
From: jiangyiwen @ 2018-12-12  9:31 UTC (permalink / raw)
  To: Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang
  Cc: netdev, kvm, virtualization

When the guest receives into mergeable rx buffers, it can merge
the scattered rx buffers into one big buffer and then copy it
to user space.

In addition, it also uses an iovec to replace buf in struct
virtio_vsock_pkt, keeping tx and rx consistent. The only
difference is that tx still uses a single segment of contiguous
physical memory.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
---
 drivers/vhost/vsock.c                   |  31 +++++++---
 include/linux/virtio_vsock.h            |   6 +-
 net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
 net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
 4 files changed, 166 insertions(+), 35 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index dc52b0f..c7ab0dd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		size_t nbytes;
 		size_t len;
 		s16 headcount;
+		size_t remain_len;
+		int i;

 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 			break;
 		}

-		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
-		if (nbytes != pkt->len) {
-			virtio_transport_free_pkt(pkt);
-			vq_err(vq, "Faulted on copying pkt buf\n");
-			break;
+		remain_len = pkt->len;
+		for (i = 0; i < pkt->nr_vecs; i++) {
+			int tmp_len;
+
+			tmp_len = min(remain_len, pkt->vec[i].iov_len);
+			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
+			if (nbytes != tmp_len) {
+				virtio_transport_free_pkt(pkt);
+				vq_err(vq, "Faulted on copying pkt buf\n");
+				break;
+			}
+
+			remain_len -= tmp_len;
 		}

 		vhost_add_used_n(vq, vq->heads, headcount);
@@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
 	struct iov_iter iov_iter;
 	size_t nbytes;
 	size_t len;
+	void *buf;

 	if (in != 0) {
 		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
@@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
 		return NULL;
 	}

-	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
-	if (!pkt->buf) {
+	buf = kmalloc(pkt->len, GFP_KERNEL);
+	if (!buf) {
 		kfree(pkt);
 		return NULL;
 	}

-	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
+	pkt->vec[0].iov_base = buf;
+	pkt->vec[0].iov_len = pkt->len;
+	pkt->nr_vecs = 1;
+
+	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
 	if (nbytes != pkt->len) {
 		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
 		       pkt->len, nbytes);
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index da9e1fe..734eeed 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -13,6 +13,8 @@
 #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
 #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
 #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
+/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
+#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)

 /* Virtio-vsock feature */
 #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
@@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
 	struct list_head list;
 	/* socket refcnt not held, only use for cancellation */
 	struct vsock_sock *vsk;
-	void *buf;
+	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
+	int nr_vecs;
 	u32 len;
 	u32 off;
 	bool reply;
+	bool mergeable;
 };

 struct virtio_vsock_pkt_info {
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index c4a465c..148b58a 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,

 		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
 		sgs[out_sg++] = &hdr;
-		if (pkt->buf) {
-			sg_init_one(&buf, pkt->buf, pkt->len);
+		if (pkt->len) {
+			/* Currently only support a segment of memory in tx */
+			BUG_ON(pkt->vec[0].iov_len != pkt->len);
+			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
 			sgs[out_sg++] = &buf;
 		}

@@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
 	struct virtio_vsock_pkt *pkt;
 	struct scatterlist hdr, buf, *sgs[2];
 	int ret;
+	void *pkt_buf;

 	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
 	if (!pkt)
 		return -ENOMEM;

-	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
-	if (!pkt->buf) {
+	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
+	if (!pkt_buf) {
 		virtio_transport_free_pkt(pkt);
 		return -ENOMEM;
 	}

+	pkt->vec[0].iov_base = pkt_buf;
+	pkt->vec[0].iov_len = buf_len;
+	pkt->nr_vecs = 1;
+
 	pkt->len = buf_len;

 	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
 	sgs[0] = &hdr;

-	sg_init_one(&buf, pkt->buf, buf_len);
+	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
 	sgs[1] = &buf;
 	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
 	if (ret)
@@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
 	return val < virtqueue_get_vring_size(vq);
 }

+static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
+		struct virtio_vsock *vsock, unsigned int *total_len)
+{
+	struct virtio_vsock_pkt *pkt;
+	u16 num_buf;
+	void *buf;
+	unsigned int len;
+	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
+
+	buf = virtqueue_get_buf(vq, &len);
+	if (!buf)
+		return NULL;
+
+	*total_len = len;
+	vsock->rx_buf_nr--;
+
+	if (unlikely(len < vsock_hlen)) {
+		put_page(virt_to_head_page(buf));
+		return NULL;
+	}
+
+	pkt = buf;
+	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
+	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
+		put_page(virt_to_head_page(buf));
+		return NULL;
+	}
+
+	/* Initialize pkt residual structure */
+	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
+			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
+
+	pkt->mergeable = true;
+	pkt->len = le32_to_cpu(pkt->hdr.len);
+	if (!pkt->len)
+		return pkt;
+
+	len -= vsock_hlen;
+	if (len) {
+		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
+		pkt->vec[pkt->nr_vecs].iov_len = len;
+		/* Shared page with pkt, so get page in advance */
+		get_page(virt_to_head_page(buf));
+		pkt->nr_vecs++;
+	}
+
+	while (--num_buf) {
+		buf = virtqueue_get_buf(vq, &len);
+		if (!buf)
+			goto err;
+
+		*total_len += len;
+		vsock->rx_buf_nr--;
+
+		pkt->vec[pkt->nr_vecs].iov_base = buf;
+		pkt->vec[pkt->nr_vecs].iov_len = len;
+		pkt->nr_vecs++;
+	}
+
+	return pkt;
+err:
+	virtio_transport_free_pkt(pkt);
+	return NULL;
+}
+
 static void virtio_transport_rx_work(struct work_struct *work)
 {
 	struct virtio_vsock *vsock =
 		container_of(work, struct virtio_vsock, rx_work);
 	struct virtqueue *vq;
+	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
+			sizeof(struct virtio_vsock_hdr);

 	vq = vsock->vqs[VSOCK_VQ_RX];

@@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
 				goto out;
 			}

-			pkt = virtqueue_get_buf(vq, &len);
-			if (!pkt) {
-				break;
-			}
+			if (likely(vsock->mergeable)) {
+				pkt = receive_mergeable(vq, vsock, &len);
+				if (!pkt)
+					break;
+			} else {
+				pkt = virtqueue_get_buf(vq, &len);
+				if (!pkt)
+					break;

-			vsock->rx_buf_nr--;
+				vsock->rx_buf_nr--;
+			}

 			/* Drop short/long packets */
-			if (unlikely(len < sizeof(pkt->hdr) ||
-				     len > sizeof(pkt->hdr) + pkt->len)) {
+			if (unlikely(len < vsock_hlen ||
+				     len > vsock_hlen + pkt->len)) {
 				virtio_transport_free_pkt(pkt);
 				continue;
 			}

-			pkt->len = len - sizeof(pkt->hdr);
+			pkt->len = len - vsock_hlen;
 			virtio_transport_deliver_tap_pkt(pkt);
 			virtio_transport_recv_pkt(pkt);
 		}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 3ae3a33..123a8b6 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
 {
 	struct virtio_vsock_pkt *pkt;
 	int err;
+	void *buf = NULL;

 	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
 	if (!pkt)
@@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
 	pkt->vsk		= info->vsk;

 	if (info->msg && len > 0) {
-		pkt->buf = kmalloc(len, GFP_KERNEL);
-		if (!pkt->buf)
+		buf = kmalloc(len, GFP_KERNEL);
+		if (!buf)
 			goto out_pkt;
-		err = memcpy_from_msg(pkt->buf, info->msg, len);
+		err = memcpy_from_msg(buf, info->msg, len);
 		if (err)
 			goto out;
+
+		pkt->vec[0].iov_base = buf;
+		pkt->vec[0].iov_len = len;
+		pkt->nr_vecs = 1;
 	}

 	trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
 	return pkt;

 out:
-	kfree(pkt->buf);
+	kfree(buf);
 out_pkt:
 	kfree(pkt);
 	return NULL;
@@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
 	struct virtio_vsock_pkt *pkt = opaque;
 	struct af_vsockmon_hdr *hdr;
 	struct sk_buff *skb;
+	int i;

 	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
 			GFP_ATOMIC);
@@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
 	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));

 	if (pkt->len) {
-		skb_put_data(skb, pkt->buf, pkt->len);
+		for (i = 0; i < pkt->nr_vecs; i++)
+			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
 	}

 	return skb;
@@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,

 	spin_lock_bh(&vvs->rx_lock);
 	while (total < len && !list_empty(&vvs->rx_queue)) {
+		size_t copy_bytes, last_vec_total = 0, vec_off;
+		int i;
+
 		pkt = list_first_entry(&vvs->rx_queue,
 				       struct virtio_vsock_pkt, list);

@@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
 		 */
 		spin_unlock_bh(&vvs->rx_lock);

-		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
-		if (err)
-			goto out;
+		for (i = 0; i < pkt->nr_vecs; i++) {
+			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
+				last_vec_total += pkt->vec[i].iov_len;
+				continue;
+			}
+
+			vec_off = pkt->off - last_vec_total;
+			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
+			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
+					copy_bytes);
+			if (err)
+				goto out;
+
+			bytes -= copy_bytes;
+			pkt->off += copy_bytes;
+			total += copy_bytes;
+			last_vec_total += pkt->vec[i].iov_len;
+			if (!bytes)
+				break;
+		}

 		spin_lock_bh(&vvs->rx_lock);
-
-		total += bytes;
-		pkt->off += bytes;
 		if (pkt->off == pkt->len) {
 			virtio_transport_dec_rx_pkt(vvs, pkt);
 			list_del(&pkt->list);
@@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)

 void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
 {
-	kfree(pkt->buf);
-	kfree(pkt);
+	int i;
+
+	if (pkt->mergeable) {
+		for (i = 0; i < pkt->nr_vecs; i++)
+			put_page(virt_to_head_page(pkt->vec[i].iov_base));
+		put_page(virt_to_head_page((void *)pkt));
+	} else {
+		for (i = 0; i < pkt->nr_vecs; i++)
+			kfree(pkt->vec[i].iov_base);
+		kfree(pkt);
+	}
 }
 EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);

-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-12-14  8:41 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-12  9:31 [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest jiangyiwen
2018-12-12 15:31 ` Michael S. Tsirkin
2018-12-13  2:38   ` jiangyiwen
2018-12-13 14:29     ` Michael S. Tsirkin
2018-12-13 14:29     ` Michael S. Tsirkin
2018-12-14  8:11       ` jiangyiwen
2018-12-14  8:11       ` jiangyiwen
2018-12-12 15:31 ` Michael S. Tsirkin
2018-12-13 16:20 ` Stefan Hajnoczi
2018-12-14  8:41   ` jiangyiwen
2018-12-14  8:41   ` jiangyiwen
2018-12-13 16:20 ` Stefan Hajnoczi
2018-12-12  9:31 jiangyiwen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.