All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: jiangyiwen <jiangyiwen@huawei.com>
Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	Stefan Hajnoczi <stefanha@redhat.com>,
	virtualization@lists.linux-foundation.org
Subject: Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
Date: Thu, 13 Dec 2018 09:29:10 -0500	[thread overview]
Message-ID: <20181213092619-mutt-send-email-mst__31825.9296509952$1544711271$gmane$org@kernel.org> (raw)
In-Reply-To: <5C11C611.40901@huawei.com>

On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:
> Hi Michael,
> 
> On 2018/12/12 23:31, Michael S. Tsirkin wrote:
> > On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
> >> When the guest receives a mergeable rx buffer, it can
> >> merge the scattered rx buffers into one big buffer and
> >> then copy it to user space.
> >>
> >> In addition, it also uses an iovec to replace buf in struct
> >> virtio_vsock_pkt, keeping tx and rx consistent. The only
> >> difference is that tx is still implemented with a single
> >> segment of contiguous physical memory.
> >>
> >> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
> >> ---
> >>  drivers/vhost/vsock.c                   |  31 +++++++---
> >>  include/linux/virtio_vsock.h            |   6 +-
> >>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
> >>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
> >>  4 files changed, 166 insertions(+), 35 deletions(-)
> > 
> > 
> > This was supposed to be a guest patch, why is vhost changed here?
> > 
> 
> In the mergeable rx buffer case, big packets need to be scattered
> into several buffers, so I added a kvec array to struct
> virtio_vsock_pkt. At the same time, in order to keep tx and rx
> consistent, I used the kvec to replace the buf variable; because
> vhost uses pkt->buf, this patch also has to change vhost.

You'd want to split these patches imho.

> >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> >> index dc52b0f..c7ab0dd 100644
> >> --- a/drivers/vhost/vsock.c
> >> +++ b/drivers/vhost/vsock.c
> >> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> >>  		size_t nbytes;
> >>  		size_t len;
> >>  		s16 headcount;
> >> +		size_t remain_len;
> >> +		int i;
> >>
> >>  		spin_lock_bh(&vsock->send_pkt_list_lock);
> >>  		if (list_empty(&vsock->send_pkt_list)) {
> >> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> >>  			break;
> >>  		}
> >>
> >> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> >> -		if (nbytes != pkt->len) {
> >> -			virtio_transport_free_pkt(pkt);
> >> -			vq_err(vq, "Faulted on copying pkt buf\n");
> >> -			break;
> >> +		remain_len = pkt->len;
> >> +		for (i = 0; i < pkt->nr_vecs; i++) {
> >> +			int tmp_len;
> >> +
> >> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
> >> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
> >> +			if (nbytes != tmp_len) {
> >> +				virtio_transport_free_pkt(pkt);
> >> +				vq_err(vq, "Faulted on copying pkt buf\n");
> >> +				break;
> >> +			}
> >> +
> >> +			remain_len -= tmp_len;
> >>  		}
> >>
> >>  		vhost_add_used_n(vq, vq->heads, headcount);
> >> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> >>  	struct iov_iter iov_iter;
> >>  	size_t nbytes;
> >>  	size_t len;
> >> +	void *buf;
> >>
> >>  	if (in != 0) {
> >>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
> >> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> >>  		return NULL;
> >>  	}
> >>
> >> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
> >> -	if (!pkt->buf) {
> >> +	buf = kmalloc(pkt->len, GFP_KERNEL);
> >> +	if (!buf) {
> >>  		kfree(pkt);
> >>  		return NULL;
> >>  	}
> >>
> >> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> >> +	pkt->vec[0].iov_base = buf;
> >> +	pkt->vec[0].iov_len = pkt->len;
> >> +	pkt->nr_vecs = 1;
> >> +
> >> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
> >>  	if (nbytes != pkt->len) {
> >>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> >>  		       pkt->len, nbytes);
> >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> >> index da9e1fe..734eeed 100644
> >> --- a/include/linux/virtio_vsock.h
> >> +++ b/include/linux/virtio_vsock.h
> >> @@ -13,6 +13,8 @@
> >>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
> >>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
> >>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
> >> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
> >> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
> >>
> >>  /* Virtio-vsock feature */
> >>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
> >> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
> >>  	struct list_head list;
> >>  	/* socket refcnt not held, only use for cancellation */
> >>  	struct vsock_sock *vsk;
> >> -	void *buf;
> >> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
> >> +	int nr_vecs;
> >>  	u32 len;
> >>  	u32 off;
> >>  	bool reply;
> >> +	bool mergeable;
> >>  };
> >>
> >>  struct virtio_vsock_pkt_info {
> >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> >> index c4a465c..148b58a 100644
> >> --- a/net/vmw_vsock/virtio_transport.c
> >> +++ b/net/vmw_vsock/virtio_transport.c
> >> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
> >>
> >>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> >>  		sgs[out_sg++] = &hdr;
> >> -		if (pkt->buf) {
> >> -			sg_init_one(&buf, pkt->buf, pkt->len);
> >> +		if (pkt->len) {
> >> +			/* Currently only support a segment of memory in tx */
> >> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
> >> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
> >>  			sgs[out_sg++] = &buf;
> >>  		}
> >>
> >> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
> >>  	struct virtio_vsock_pkt *pkt;
> >>  	struct scatterlist hdr, buf, *sgs[2];
> >>  	int ret;
> >> +	void *pkt_buf;
> >>
> >>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
> >>  	if (!pkt)
> >>  		return -ENOMEM;
> >>
> >> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
> >> -	if (!pkt->buf) {
> >> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
> >> +	if (!pkt_buf) {
> >>  		virtio_transport_free_pkt(pkt);
> >>  		return -ENOMEM;
> >>  	}
> >>
> >> +	pkt->vec[0].iov_base = pkt_buf;
> >> +	pkt->vec[0].iov_len = buf_len;
> >> +	pkt->nr_vecs = 1;
> >> +
> >>  	pkt->len = buf_len;
> >>
> >>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> >>  	sgs[0] = &hdr;
> >>
> >> -	sg_init_one(&buf, pkt->buf, buf_len);
> >> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
> >>  	sgs[1] = &buf;
> >>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
> >>  	if (ret)
> >> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
> >>  	return val < virtqueue_get_vring_size(vq);
> >>  }
> >>
> >> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> >> +		struct virtio_vsock *vsock, unsigned int *total_len)
> >> +{
> >> +	struct virtio_vsock_pkt *pkt;
> >> +	u16 num_buf;
> >> +	void *buf;
> >> +	unsigned int len;
> >> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
> >> +
> >> +	buf = virtqueue_get_buf(vq, &len);
> >> +	if (!buf)
> >> +		return NULL;
> >> +
> >> +	*total_len = len;
> >> +	vsock->rx_buf_nr--;
> >> +
> >> +	if (unlikely(len < vsock_hlen)) {
> >> +		put_page(virt_to_head_page(buf));
> >> +		return NULL;
> >> +	}
> >> +
> >> +	pkt = buf;
> >> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> >> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
> >> +		put_page(virt_to_head_page(buf));
> >> +		return NULL;
> >> +	}
> > 
> > So everything just stops going, and host and user don't even
> > know what the reason is. And not only that - the next
> > packet will be corrupted because we skipped the first one.
> > 
> > 
> 
> I understand this case will not be encountered unless the code has
> a *BUG*, e.g. the host sends problematic packets (shorter/longer than
> expected). In this case, I think we should ignore/drop these packets.

If there's a specific packet length expected, e.g. in this case the
vector size, it needs to be negotiated between host and guest in some way.

> > 
> >> +
> >> +	/* Initialize pkt residual structure */
> >> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
> >> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> >> +
> >> +	pkt->mergeable = true;
> >> +	pkt->len = le32_to_cpu(pkt->hdr.len);
> >> +	if (!pkt->len)
> >> +		return pkt;
> >> +
> >> +	len -= vsock_hlen;
> >> +	if (len) {
> >> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
> >> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> >> +		/* Shared page with pkt, so get page in advance */
> >> +		get_page(virt_to_head_page(buf));
> >> +		pkt->nr_vecs++;
> >> +	}
> >> +
> >> +	while (--num_buf) {
> >> +		buf = virtqueue_get_buf(vq, &len);
> >> +		if (!buf)
> >> +			goto err;
> >> +
> >> +		*total_len += len;
> >> +		vsock->rx_buf_nr--;
> >> +
> >> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
> >> +		pkt->vec[pkt->nr_vecs].iov_len = len;
> >> +		pkt->nr_vecs++;
> >> +	}
> >> +
> >> +	return pkt;
> >> +err:
> >> +	virtio_transport_free_pkt(pkt);
> >> +	return NULL;
> >> +}
> >> +
> >>  static void virtio_transport_rx_work(struct work_struct *work)
> >>  {
> >>  	struct virtio_vsock *vsock =
> >>  		container_of(work, struct virtio_vsock, rx_work);
> >>  	struct virtqueue *vq;
> >> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
> >> +			sizeof(struct virtio_vsock_hdr);
> >>
> >>  	vq = vsock->vqs[VSOCK_VQ_RX];
> >>
> >> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
> >>  				goto out;
> >>  			}
> >>
> >> -			pkt = virtqueue_get_buf(vq, &len);
> >> -			if (!pkt) {
> >> -				break;
> >> -			}
> >> +			if (likely(vsock->mergeable)) {
> >> +				pkt = receive_mergeable(vq, vsock, &len);
> >> +				if (!pkt)
> >> +					break;
> >> +			} else {
> >> +				pkt = virtqueue_get_buf(vq, &len);
> >> +				if (!pkt)
> >> +					break;
> >>
> > 
> > So looking at it, this seems to be the main source of the gain.
> > But why does this require host/guest changes?
> > 
> > 
> > The way I see it:
> > 	- get a buffer and create an skb
> > 	- get the next one, check header matches, if yes
> > 	  tack it on the skb as a fragment. If not then
> > 	  don't, deliver previous one and queue the new one.
> > 
> > 
> 
> The reason for the vhost change is explained above, and I would
> like to use a kvec instead of buf, since buf can only describe a
> contiguous region of physical memory.
> 
> Thanks,
> Yiwen.


I got the reason but I am not yet convinced it's a good one.
You don't necessarily need host to skip headers
in all but the first chunk. Guest can do this just as well.



> > 
> >> -			vsock->rx_buf_nr--;
> >> +				vsock->rx_buf_nr--;
> >> +			}
> >>
> >>  			/* Drop short/long packets */
> >> -			if (unlikely(len < sizeof(pkt->hdr) ||
> >> -				     len > sizeof(pkt->hdr) + pkt->len)) {
> >> +			if (unlikely(len < vsock_hlen ||
> >> +				     len > vsock_hlen + pkt->len)) {
> >>  				virtio_transport_free_pkt(pkt);
> >>  				continue;
> >>  			}
> >>
> >> -			pkt->len = len - sizeof(pkt->hdr);
> >> +			pkt->len = len - vsock_hlen;
> >>  			virtio_transport_deliver_tap_pkt(pkt);
> >>  			virtio_transport_recv_pkt(pkt);
> >>  		}
> >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> >> index 3ae3a33..123a8b6 100644
> >> --- a/net/vmw_vsock/virtio_transport_common.c
> >> +++ b/net/vmw_vsock/virtio_transport_common.c
> >> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  {
> >>  	struct virtio_vsock_pkt *pkt;
> >>  	int err;
> >> +	void *buf = NULL;
> >>
> >>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
> >>  	if (!pkt)
> >> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  	pkt->vsk		= info->vsk;
> >>
> >>  	if (info->msg && len > 0) {
> >> -		pkt->buf = kmalloc(len, GFP_KERNEL);
> >> -		if (!pkt->buf)
> >> +		buf = kmalloc(len, GFP_KERNEL);
> >> +		if (!buf)
> >>  			goto out_pkt;
> >> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
> >> +		err = memcpy_from_msg(buf, info->msg, len);
> >>  		if (err)
> >>  			goto out;
> >> +
> >> +		pkt->vec[0].iov_base = buf;
> >> +		pkt->vec[0].iov_len = len;
> >> +		pkt->nr_vecs = 1;
> >>  	}
> >>
> >>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
> >> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> >>  	return pkt;
> >>
> >>  out:
> >> -	kfree(pkt->buf);
> >> +	kfree(buf);
> >>  out_pkt:
> >>  	kfree(pkt);
> >>  	return NULL;
> >> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >>  	struct virtio_vsock_pkt *pkt = opaque;
> >>  	struct af_vsockmon_hdr *hdr;
> >>  	struct sk_buff *skb;
> >> +	int i;
> >>
> >>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
> >>  			GFP_ATOMIC);
> >> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
> >>
> >>  	if (pkt->len) {
> >> -		skb_put_data(skb, pkt->buf, pkt->len);
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
> >>  	}
> >>
> >>  	return skb;
> >> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> >>
> >>  	spin_lock_bh(&vvs->rx_lock);
> >>  	while (total < len && !list_empty(&vvs->rx_queue)) {
> >> +		size_t copy_bytes, last_vec_total = 0, vec_off;
> >> +		int i;
> >> +
> >>  		pkt = list_first_entry(&vvs->rx_queue,
> >>  				       struct virtio_vsock_pkt, list);
> >>
> >> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> >>  		 */
> >>  		spin_unlock_bh(&vvs->rx_lock);
> >>
> >> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> >> -		if (err)
> >> -			goto out;
> >> +		for (i = 0; i < pkt->nr_vecs; i++) {
> >> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
> >> +				last_vec_total += pkt->vec[i].iov_len;
> >> +				continue;
> >> +			}
> >> +
> >> +			vec_off = pkt->off - last_vec_total;
> >> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
> >> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
> >> +					copy_bytes);
> >> +			if (err)
> >> +				goto out;
> >> +
> >> +			bytes -= copy_bytes;
> >> +			pkt->off += copy_bytes;
> >> +			total += copy_bytes;
> >> +			last_vec_total += pkt->vec[i].iov_len;
> >> +			if (!bytes)
> >> +				break;
> >> +		}
> >>
> >>  		spin_lock_bh(&vvs->rx_lock);
> >> -
> >> -		total += bytes;
> >> -		pkt->off += bytes;
> >>  		if (pkt->off == pkt->len) {
> >>  			virtio_transport_dec_rx_pkt(vvs, pkt);
> >>  			list_del(&pkt->list);
> >> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
> >>
> >>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
> >>  {
> >> -	kfree(pkt->buf);
> >> -	kfree(pkt);
> >> +	int i;
> >> +
> >> +	if (pkt->mergeable) {
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
> >> +		put_page(virt_to_head_page((void *)pkt));
> >> +	} else {
> >> +		for (i = 0; i < pkt->nr_vecs; i++)
> >> +			kfree(pkt->vec[i].iov_base);
> >> +		kfree(pkt);
> >> +	}
> >>  }
> >>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
> >>
> >> -- 
> >> 1.8.3.1
> >>
> > 
> > .
> > 
> 

  reply	other threads:[~2018-12-13 14:29 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-12  9:31 [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest jiangyiwen
2018-12-12 15:31 ` Michael S. Tsirkin
2018-12-13  2:38   ` jiangyiwen
2018-12-13 14:29     ` Michael S. Tsirkin [this message]
2018-12-13 14:29     ` Michael S. Tsirkin
2018-12-14  8:11       ` jiangyiwen
2018-12-14  8:11       ` jiangyiwen
2018-12-12 15:31 ` Michael S. Tsirkin
2018-12-13 16:20 ` Stefan Hajnoczi
2018-12-14  8:41   ` jiangyiwen
2018-12-14  8:41   ` jiangyiwen
2018-12-13 16:20 ` Stefan Hajnoczi
2018-12-12  9:31 jiangyiwen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='20181213092619-mutt-send-email-mst__31825.9296509952$1544711271$gmane$org@kernel.org' \
    --to=mst@redhat.com \
    --cc=jiangyiwen@huawei.com \
    --cc=kvm@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=stefanha@redhat.com \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.