All of lore.kernel.org
 help / color / mirror / Atom feed
From: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
To: Stefano Garzarella <sgarzare@redhat.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	Jason Wang <jasowang@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	"edumazet@google.com" <edumazet@google.com>,
	Paolo Abeni <pabeni@redhat.com>, Jakub Kicinski <kuba@kernel.org>,
	Krasnov Arseniy <oxffffaa@gmail.com>
Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"virtualization@lists.linux-foundation.org" 
	<virtualization@lists.linux-foundation.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	kernel <kernel@sberdevices.ru>
Subject: [RFC PATCH v3 04/11] virtio/vsock: add transport zerocopy callback
Date: Sun, 6 Nov 2022 19:41:50 +0000	[thread overview]
Message-ID: <a64b20e2-8dd7-9686-97b6-489f63d178d2@sberdevices.ru> (raw)
In-Reply-To: <f60d7e94-795d-06fd-0321-6972533700c5@sberdevices.ru>

This adds a transport callback which processes the rx queue of a socket:
instead of copying data to a user-provided buffer, it inserts the data
pages of each packet into the user's VM area.

Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
---
 include/linux/virtio_vsock.h            |   7 +
 include/uapi/linux/virtio_vsock.h       |  14 ++
 net/vmw_vsock/virtio_transport_common.c | 244 +++++++++++++++++++++++-
 3 files changed, 261 insertions(+), 4 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c1be40f89a89..d10fdfd8d144 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -37,6 +37,7 @@ struct virtio_vsock_sock {
 	u32 buf_alloc;
 	struct list_head rx_queue;
 	u32 msg_count;
+	struct page *usr_poll_page;
 };
 
 struct virtio_vsock_pkt {
@@ -51,6 +52,7 @@ struct virtio_vsock_pkt {
 	bool reply;
 	bool tap_delivered;
 	bool slab_buf;
+	bool split;
 };
 
 struct virtio_vsock_pkt_info {
@@ -131,6 +133,11 @@ int virtio_transport_dgram_bind(struct vsock_sock *vsk,
 				struct sockaddr_vm *addr);
 bool virtio_transport_dgram_allow(u32 cid, u32 port);
 
+int virtio_transport_zerocopy_init(struct vsock_sock *vsk,
+				   struct vm_area_struct *vma);
+int virtio_transport_zerocopy_dequeue(struct vsock_sock *vsk,
+				      struct page **pages,
+				      unsigned long *pages_num);
 int virtio_transport_connect(struct vsock_sock *vsk);
 
 int virtio_transport_shutdown(struct vsock_sock *vsk, int mode);
diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
index 64738838bee5..2a0e4f309918 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -66,6 +66,20 @@ struct virtio_vsock_hdr {
 	__le32	fwd_cnt;
 } __attribute__((packed));
 
+struct virtio_vsock_usr_hdr {
+	u32 flags;
+	u32 len;
+} __attribute__((packed));
+
+#define VIRTIO_VSOCK_USR_POLL_NO_DATA   0
+#define VIRTIO_VSOCK_USR_POLL_HAS_DATA  1
+#define VIRTIO_VSOCK_USR_POLL_SHUTDOWN ~0
+
+struct virtio_vsock_usr_hdr_pref {
+	u32 poll_value;
+	u32 hdr_num;
+} __attribute__((packed));
+
 enum virtio_vsock_type {
 	VIRTIO_VSOCK_TYPE_STREAM = 1,
 	VIRTIO_VSOCK_TYPE_SEQPACKET = 2,
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 444764869670..fa4a2688a5d5 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -12,6 +12,7 @@
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/virtio_vsock.h>
+#include <linux/mm.h>
 #include <uapi/linux/vsockmon.h>
 
 #include <net/sock.h>
@@ -241,6 +242,14 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
 					struct virtio_vsock_pkt *pkt)
 {
+	if (vvs->usr_poll_page) {
+		struct virtio_vsock_usr_hdr_pref *hdr;
+
+		hdr = (struct virtio_vsock_usr_hdr_pref *)page_to_virt(vvs->usr_poll_page);
+
+		hdr->poll_value = VIRTIO_VSOCK_USR_POLL_HAS_DATA;
+	}
+
 	if (vvs->rx_bytes + pkt->len > vvs->buf_alloc)
 		return false;
 
@@ -253,6 +262,14 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
 {
 	vvs->rx_bytes -= pkt->len;
 	vvs->fwd_cnt += pkt->len;
+
+	if (!vvs->rx_bytes && vvs->usr_poll_page) {
+		struct virtio_vsock_usr_hdr_pref *hdr;
+
+		hdr = (struct virtio_vsock_usr_hdr_pref *)page_to_virt(vvs->usr_poll_page);
+
+		hdr->poll_value = VIRTIO_VSOCK_USR_POLL_NO_DATA;
+	}
 }
 
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
@@ -347,6 +364,191 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
 	return err;
 }
 
+int virtio_transport_zerocopy_init(struct vsock_sock *vsk,
+				   struct vm_area_struct *vma)
+{
+	struct virtio_vsock_sock *vvs;
+	int err = 0;
+
+	if (vma->vm_end - vma->vm_start < 2 * PAGE_SIZE)
+		return -EINVAL;
+
+	vvs = vsk->trans;
+
+	spin_lock_bh(&vvs->rx_lock);
+
+	if (!vvs->usr_poll_page) {
+		/* GFP_ATOMIC because of spinlock. */
+		vvs->usr_poll_page = alloc_page(GFP_KERNEL | GFP_ATOMIC);
+
+		if (!vvs->usr_poll_page) {
+			err = -ENOMEM;
+		} else {
+			struct virtio_vsock_usr_hdr_pref *usr_hdr_pref;
+			unsigned long one_page = 1;
+
+			usr_hdr_pref = page_to_virt(vvs->usr_poll_page);
+
+			if (vsk->peer_shutdown & SHUTDOWN_MASK) {
+				usr_hdr_pref->poll_value = VIRTIO_VSOCK_USR_POLL_SHUTDOWN;
+			} else {
+				usr_hdr_pref->poll_value = vvs->rx_bytes ?
+						VIRTIO_VSOCK_USR_POLL_HAS_DATA :
+						VIRTIO_VSOCK_USR_POLL_NO_DATA;
+			}
+
+			usr_hdr_pref->hdr_num = 0;
+
+			err = vm_insert_pages(vma, vma->vm_start,
+					      &vvs->usr_poll_page,
+					      &one_page);
+
+			if (one_page)
+				err = -EINVAL;
+		}
+	} else {
+		err = -EINVAL;
+	}
+
+	spin_unlock_bh(&vvs->rx_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_zerocopy_init);
+
+int virtio_transport_zerocopy_dequeue(struct vsock_sock *vsk,
+				      struct page **pages,
+				      unsigned long *pages_num)
+{
+	struct virtio_vsock_usr_hdr_pref *usr_hdr_pref;
+	struct virtio_vsock_usr_hdr *usr_hdr_buffer;
+	struct virtio_vsock_sock *vvs;
+	unsigned long max_usr_hdrs;
+	struct page *usr_hdr_page;
+	int pages_cnt;
+
+	if (*pages_num < 2)
+		return -EINVAL;
+
+	vvs = vsk->trans;
+
+	max_usr_hdrs = (PAGE_SIZE - sizeof(*usr_hdr_pref)) / sizeof(*usr_hdr_buffer);
+	*pages_num = min(max_usr_hdrs, *pages_num);
+	pages_cnt = 0;
+
+	spin_lock_bh(&vvs->rx_lock);
+
+	if (!vvs->usr_poll_page) {
+		spin_unlock_bh(&vvs->rx_lock);
+		return -EINVAL;
+	}
+
+	usr_hdr_page = vvs->usr_poll_page;
+	usr_hdr_pref = page_to_virt(usr_hdr_page);
+	usr_hdr_buffer = (struct virtio_vsock_usr_hdr *)(usr_hdr_pref + 1);
+	usr_hdr_pref->hdr_num = 0;
+
+	/* If the ref counter is 1, the page has been owned
+	 * since allocation and is not mapped yet, so insert
+	 * it into the output array. It will be mapped.
+	 */
+	if (page_ref_count(usr_hdr_page) == 1) {
+		pages[pages_cnt++] = usr_hdr_page;
+		/* Inc ref one more, as AF_VSOCK layer calls
+		 * 'put_page()' for each returned page.
+		 */
+		get_page(usr_hdr_page);
+	} else {
+		pages[pages_cnt++] = NULL;
+	}
+
+	/* Polling page is already mapped. */
+	while (!list_empty(&vvs->rx_queue) &&
+	       pages_cnt < *pages_num) {
+		struct virtio_vsock_pkt *pkt;
+		ssize_t rest_data_bytes;
+		size_t moved_data_bytes;
+		unsigned long pg_offs;
+
+		pkt = list_first_entry(&vvs->rx_queue,
+				       struct virtio_vsock_pkt, list);
+
+		rest_data_bytes = le32_to_cpu(pkt->hdr.len) - pkt->off;
+
+		/* For packets bigger than one page, split the
+		 * high-order allocated buffer into 0-order pages.
+		 * Otherwise 'vm_insert_pages()' will fail for
+		 * all pages except the first.
+		 */
+		if (rest_data_bytes > PAGE_SIZE) {
+			/* High order buffer not split yet. */
+			if (!pkt->split) {
+				split_page(virt_to_page(pkt->buf),
+					   get_order(le32_to_cpu(pkt->hdr.len)));
+				pkt->split = true;
+			}
+		}
+
+		pg_offs = pkt->off;
+		moved_data_bytes = 0;
+
+		while (rest_data_bytes &&
+		       pages_cnt < *pages_num) {
+			struct page *buf_page;
+
+			buf_page = virt_to_page(pkt->buf + pg_offs);
+
+			pages[pages_cnt++] = buf_page;
+			/* Get reference to prevent this page being
+			 * returned to page allocator when packet will
+			 * be freed. Ref count will be 2.
+			 */
+			get_page(buf_page);
+			pg_offs += PAGE_SIZE;
+
+			if (rest_data_bytes >= PAGE_SIZE) {
+				moved_data_bytes += PAGE_SIZE;
+				rest_data_bytes -= PAGE_SIZE;
+			} else {
+				moved_data_bytes += rest_data_bytes;
+				rest_data_bytes = 0;
+			}
+		}
+
+		if (!rest_data_bytes)
+			usr_hdr_buffer->flags = le32_to_cpu(pkt->hdr.flags);
+		else
+			usr_hdr_buffer->flags = 0;
+
+		usr_hdr_buffer->len = moved_data_bytes;
+
+		usr_hdr_buffer++;
+		usr_hdr_pref->hdr_num++;
+
+		pkt->off = pg_offs;
+
+		if (rest_data_bytes == 0) {
+			list_del(&pkt->list);
+			virtio_transport_dec_rx_pkt(vvs, pkt);
+			virtio_transport_free_pkt(pkt);
+		}
+
+		/* Now ref count for all pages of packet is 1. */
+	}
+
+	if (*pages_num - 1 < max_usr_hdrs)
+		usr_hdr_buffer->len = 0;
+
+	spin_unlock_bh(&vvs->rx_lock);
+
+	virtio_transport_send_credit_update(vsk);
+
+	*pages_num = pages_cnt;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_zerocopy_dequeue);
+
 static ssize_t
 virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 				   struct msghdr *msg,
@@ -969,11 +1171,21 @@ void virtio_transport_release(struct vsock_sock *vsk)
 {
 	struct sock *sk = &vsk->sk;
 	bool remove_sock = true;
+	struct virtio_vsock_sock *vvs = vsk->trans;
 
 	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
 		remove_sock = virtio_transport_close(vsk);
 
 	if (remove_sock) {
+		spin_lock_bh(&vvs->rx_lock);
+
+		if (vvs->usr_poll_page) {
+			__free_page(vvs->usr_poll_page);
+			vvs->usr_poll_page = NULL;
+		}
+
+		spin_unlock_bh(&vvs->rx_lock);
+
 		sock_set_flag(sk, SOCK_DONE);
 		virtio_transport_remove_sock(vsk);
 	}
@@ -1077,6 +1289,7 @@ virtio_transport_recv_connected(struct sock *sk,
 				struct virtio_vsock_pkt *pkt)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
+	struct virtio_vsock_sock *vvs = vsk->trans;
 	int err = 0;
 
 	switch (le16_to_cpu(pkt->hdr.op)) {
@@ -1095,6 +1308,19 @@ virtio_transport_recv_connected(struct sock *sk,
 			vsk->peer_shutdown |= RCV_SHUTDOWN;
 		if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
 			vsk->peer_shutdown |= SEND_SHUTDOWN;
+
+		spin_lock_bh(&vvs->rx_lock);
+
+		if (vvs->usr_poll_page) {
+			struct virtio_vsock_usr_hdr_pref *hdr;
+
+			hdr = (struct virtio_vsock_usr_hdr_pref *)page_to_virt(vvs->usr_poll_page);
+
+			hdr->poll_value = 0xffffffff;
+		}
+
+		spin_unlock_bh(&vvs->rx_lock);
+
 		if (vsk->peer_shutdown == SHUTDOWN_MASK &&
 		    vsock_stream_has_data(vsk) <= 0 &&
 		    !sock_flag(sk, SOCK_DONE)) {
@@ -1343,11 +1569,21 @@ EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
 void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
 {
 	if (pkt->buf_len) {
-		if (pkt->slab_buf)
+		if (pkt->slab_buf) {
 			kvfree(pkt->buf);
-		else
-			free_pages((unsigned long)pkt->buf,
-				   get_order(pkt->buf_len));
+		} else {
+			unsigned int order = get_order(pkt->buf_len);
+			unsigned long buf = (unsigned long)pkt->buf;
+
+			if (pkt->split) {
+				int i;
+
+				for (i = 0; i < (1 << order); i++)
+					free_page(buf + i * PAGE_SIZE);
+			} else {
+				free_pages(buf, order);
+			}
+		}
 	}
 
 	kfree(pkt);
-- 
2.35.0

  parent reply	other threads:[~2022-11-06 19:42 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-06 19:33 [RFC PATCH v3 00/11] virtio/vsock: experimental zerocopy receive Arseniy Krasnov
2022-11-06 19:36 ` [RFC PATCH v3 01/11] virtio/vsock: rework packet allocation logic Arseniy Krasnov
2022-11-06 19:50   ` Christophe JAILLET
2022-11-07  5:23     ` Arseniy Krasnov
2022-11-07 21:24       ` Christophe JAILLET
2022-11-07 21:31         ` Arseniy Krasnov
2022-11-11 20:35   ` Bobby Eshleman
2022-11-06 19:38 ` [RFC PATCH v3 02/11] virtio/vsock: update, 'virtio_transport_recv_pkt()' Arseniy Krasnov
2022-11-06 19:40 ` [RFC PATCH v3 03/11] af_vsock: add zerocopy receive logic Arseniy Krasnov
2022-11-11 13:55   ` Stefano Garzarella
2022-11-11 13:55     ` Stefano Garzarella
2022-11-06 19:41 ` Arseniy Krasnov [this message]
2022-11-10 11:15   ` [RFC PATCH v3 04/11] virtio/vsock: add transport zerocopy callback Arseniy Krasnov
2022-11-06 19:43 ` [RFC PATCH v3 05/11] vhost/vsock: switch packet's buffer allocation Arseniy Krasnov
2022-11-06 19:45 ` [RFC PATCH v3 06/11] vhost/vsock: enable zerocopy callback Arseniy Krasnov
2022-11-06 19:47 ` [RFC PATCH v3 07/11] virtio/vsock: " Arseniy Krasnov
2022-11-06 19:48 ` [RFC PATCH v3 08/11] test/vsock: rework message bound test Arseniy Krasnov
2022-11-11 14:00   ` Stefano Garzarella
2022-11-11 14:00     ` Stefano Garzarella
2022-11-06 19:50 ` [RFC PATCH v3 09/11] test/vsock: add big message test Arseniy Krasnov
2022-11-06 19:51 ` [RFC PATCH v3 10/11] test/vsock: add receive zerocopy tests Arseniy Krasnov
2022-11-06 19:54 ` [RFC PATCH v3 11/11] test/vsock: vsock_rx_perf utility Arseniy Krasnov
2022-11-11 13:47 ` [RFC PATCH v3 00/11] virtio/vsock: experimental zerocopy receive Stefano Garzarella
2022-11-11 13:47   ` Stefano Garzarella
2022-11-11 14:06   ` Stefano Garzarella
2022-11-11 14:06     ` Stefano Garzarella
2022-11-11 18:35     ` Arseniy Krasnov
2022-11-11 20:45   ` Bobby Eshleman
2022-11-12 11:40     ` Arseniy Krasnov
2022-11-13 10:04       ` Arseniy Krasnov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a64b20e2-8dd7-9686-97b6-489f63d178d2@sberdevices.ru \
    --to=avkrasnov@sberdevices.ru \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=jasowang@redhat.com \
    --cc=kernel@sberdevices.ru \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=oxffffaa@gmail.com \
    --cc=pabeni@redhat.com \
    --cc=sgarzare@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.