All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mina Almasry <almasrymina@google.com>
To: linux-kernel@vger.kernel.org, linux-media@vger.kernel.org,
	 dri-devel@lists.freedesktop.org, linaro-mm-sig@lists.linaro.org,
	 netdev@vger.kernel.org, linux-arch@vger.kernel.org,
	 linux-kselftest@vger.kernel.org
Cc: "Mina Almasry" <almasrymina@google.com>,
	"Willem de Bruijn" <willemdebruijn.kernel@gmail.com>,
	"Jesper Dangaard Brouer" <hawk@kernel.org>,
	"Arnd Bergmann" <arnd@arndb.de>,
	"Christian König" <christian.koenig@amd.com>,
	"David Ahern" <dsahern@kernel.org>,
	"Ilias Apalodimas" <ilias.apalodimas@linaro.org>,
	"Sumit Semwal" <sumit.semwal@linaro.org>,
	jgg@ziepe.ca, "Eric Dumazet" <edumazet@google.com>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Paolo Abeni" <pabeni@redhat.com>,
	"Shuah Khan" <shuah@kernel.org>,
	"David S. Miller" <davem@davemloft.net>
Subject: [RFC PATCH 05/10] tcp: implement recvmsg() RX path for devmem TCP
Date: Mon, 10 Jul 2023 15:32:56 -0700	[thread overview]
Message-ID: <20230710223304.1174642-6-almasrymina@google.com> (raw)
In-Reply-To: <20230710223304.1174642-1-almasrymina@google.com>

In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_devmem() for custom handling.

tcp_recvmsg_devmem() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
and returns to the user a cmsg_devmem indicating the location of the
data in the dmabuf device memory. cmsg_devmem contains this information:

1.  the offset into the dmabuf where the payload starts. 'frag_offset'.
2. the size of the frag. 'frag_size'.
3. an opaque token 'frag_token' to return to the kernel when the buffer
is to be released.

The pages awaiting freeing are stored in the newly added sk->sk_pagepool,
and each page passed to userspace is get_page()'d.  This reference is
dropped once the userspace indicates that it is done reading this page.
All pages are released when the socket is destroyed.

Signed-off-by: Mina Almasry <almasrymina@google.com>
---
 include/linux/socket.h            |   1 +
 include/net/sock.h                |   2 +
 include/uapi/asm-generic/socket.h |   5 +
 include/uapi/linux/uio.h          |   6 +
 net/core/datagram.c               |   3 +
 net/ipv4/tcp.c                    | 186 +++++++++++++++++++++++++++++-
 net/ipv4/tcp_ipv4.c               |   8 ++
 7 files changed, 209 insertions(+), 2 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 13c3a237b9c9..12905b2f1215 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -326,6 +326,7 @@ struct ucred {
 					  * plain text and require encryption
 					  */
 
+#define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
diff --git a/include/net/sock.h b/include/net/sock.h
index 6f428a7f3567..c615666ff19a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -353,6 +353,7 @@ struct sk_filter;
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
   *	@sk_bind2_node: bind node in the bhash2 table
+  *	@sk_pagepool: page pool associated with this socket.
   */
 struct sock {
 	/*
@@ -545,6 +546,7 @@ struct sock {
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
 	struct hlist_node	sk_bind2_node;
+	struct xarray		sk_pagepool;
 };
 
 enum sk_pacing {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 638230899e98..88f9234f78cb 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -132,6 +132,11 @@
 
 #define SO_RCVMARK		75
 
+#define SO_DEVMEM_HEADER	98
+#define SCM_DEVMEM_HEADER	SO_DEVMEM_HEADER
+#define SO_DEVMEM_OFFSET	99
+#define SCM_DEVMEM_OFFSET	SO_DEVMEM_OFFSET
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..8b0be0f50838 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -20,6 +20,12 @@ struct iovec
 	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
 };
 
+struct cmsg_devmem {
+	__u32 frag_offset;
+	__u32 frag_size;
+	__u32 frag_token;
+};
+
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 176eb5834746..3a82598aa6ed 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -455,6 +455,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
 	skb_walk_frags(skb, frag_iter) {
 		int end;
 
+		if (frag_iter->devmem)
+			goto short_copy;
+
 		WARN_ON(start > offset + len);
 
 		end = start + frag_iter->len;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 51e8d5872670..a894b8a9dbb0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+#include <linux/dma-buf.h>
 
 /* Track pending CMSGs. */
 enum {
@@ -460,6 +461,7 @@ void tcp_init_sock(struct sock *sk)
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
+	xa_init_flags(&sk->sk_pagepool, XA_FLAGS_ALLOC);
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
@@ -2408,6 +2410,165 @@ static int tcp_inq_hint(struct sock *sk)
 	return inq;
 }
 
+static int tcp_recvmsg_devmem(const struct sock *sk, const struct sk_buff *skb,
+			      unsigned int offset, struct msghdr *msg, int len)
+{
+	unsigned int start = skb_headlen(skb);
+	struct cmsg_devmem cmsg_devmem = { 0 };
+	unsigned int tokens_added_idx = 0;
+	int i, copy = start - offset, n;
+	struct sk_buff *frag_iter;
+	u32 *tokens_added;
+	int err = 0;
+
+	if (!skb->devmem)
+		return -ENODEV;
+
+	tokens_added = kzalloc(sizeof(u32) * skb_shinfo(skb)->nr_frags,
+			       GFP_KERNEL);
+
+	if (!tokens_added)
+		return -ENOMEM;
+
+	/* Copy header. */
+	if (copy > 0) {
+		copy = min(copy, len);
+
+		n = copy_to_iter(skb->data + offset, copy, &msg->msg_iter);
+		if (n != copy) {
+			err = -EFAULT;
+			goto err_release_pages;
+		}
+
+		offset += copy;
+		len -= copy;
+
+		/* First a cmsg_devmem for # bytes copied to user buffer */
+		cmsg_devmem.frag_size = copy;
+		err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_HEADER,
+			       sizeof(cmsg_devmem), &cmsg_devmem);
+		if (err)
+			goto err_release_pages;
+
+		if (len == 0)
+			goto out;
+	}
+
+	/* after that, send information of devmem pages through a sequence
+	 * of cmsg
+	 */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		struct page *page = skb_frag_page(frag);
+		struct dma_buf_pages *priv;
+		u32 user_token, frag_offset;
+		struct page *dmabuf_pages;
+		int end;
+
+		/* skb->devmem should indicate that ALL the pages in this skb
+                 * are dma buf pages. We're checking for that flag above, but
+                 * also check individual pages here. If the driver is not
+                 * setting skb->devmem correctly, we still don't want to crash
+                 * here when accessing pgmap or priv below.
+                 */
+		if (!is_dma_buf_page(page)) {
+			net_err_ratelimited("Found non-devmem skb with dma_buf "
+					    "page");
+			err = -ENODEV;
+			goto err_release_pages;
+		}
+
+		end = start + skb_frag_size(frag);
+		copy = end - offset;
+		memset(&cmsg_devmem, 0, sizeof(cmsg_devmem));
+
+		if (copy > 0) {
+			copy = min(copy, len);
+
+			priv = (struct dma_buf_pages *)page->pp->mp_priv;
+
+			dmabuf_pages = priv->pages;
+			frag_offset = ((page - dmabuf_pages) << PAGE_SHIFT) +
+				      skb_frag_off(frag) + offset - start;
+			cmsg_devmem.frag_offset = frag_offset;
+			cmsg_devmem.frag_size = copy;
+			err = xa_alloc((struct xarray *)&sk->sk_pagepool,
+				       &user_token, page, xa_limit_31b,
+				       GFP_KERNEL);
+			if (err)
+				goto err_release_pages;
+
+			tokens_added[tokens_added_idx++] = user_token;
+
+			get_page(page);
+			cmsg_devmem.frag_token = user_token;
+
+			offset += copy;
+			len -= copy;
+
+			err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_OFFSET,
+				       sizeof(cmsg_devmem), &cmsg_devmem);
+			if (err) {
+				put_page(page);
+				goto err_release_pages;
+			}
+
+			if (len == 0)
+				goto out;
+		}
+		start = end;
+	}
+
+	if (!len)
+		goto out;
+
+	/* if len is not satisfied yet, we need to skb_walk_frags() to satisfy
+	 * len
+	 */
+	skb_walk_frags(skb, frag_iter)
+	{
+		int end;
+
+		if (!frag_iter->devmem) {
+			err = -EFAULT;
+			goto err_release_pages;
+		}
+
+		WARN_ON(start > offset + len);
+		end = start + frag_iter->len;
+		copy = end - offset;
+		if (copy > 0) {
+			if (copy > len)
+				copy = len;
+			err = tcp_recvmsg_devmem(sk, frag_iter, offset - start,
+						 msg, copy);
+			if (err)
+				goto err_release_pages;
+			len -= copy;
+			if (len == 0)
+				goto out;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (len) {
+		err = -EFAULT;
+		goto err_release_pages;
+	}
+
+	goto out;
+
+err_release_pages:
+	for (i = 0; i < tokens_added_idx; i++)
+		put_page(xa_erase((struct xarray *)&sk->sk_pagepool,
+				  tokens_added[i]));
+
+out:
+	kfree(tokens_added);
+	return err;
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -2428,7 +2589,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 	int err;
 	int target;		/* Read at least this many bytes */
 	long timeo;
-	struct sk_buff *skb, *last;
+	struct sk_buff *skb, *last, *skb_last_copied = NULL;
 	u32 urg_hole = 0;
 
 	err = -ENOTCONN;
@@ -2593,7 +2754,27 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			}
 		}
 
-		if (!(flags & MSG_TRUNC)) {
+		if (skb_last_copied && skb_last_copied->devmem != skb->devmem)
+			break;
+
+		if (skb->devmem) {
+			if (!(flags & MSG_SOCK_DEVMEM)) {
+				/* skb->devmem skbs can only be received with
+				 * the MSG_SOCK_DEVMEM flag.
+				 */
+
+				copied = -EFAULT;
+				break;
+			}
+
+			err = tcp_recvmsg_devmem(sk, skb, offset, msg, used);
+			if (err) {
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+			skb_last_copied = skb;
+		} else if (!(flags & MSG_TRUNC)) {
 			err = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (err) {
 				/* Exception. Bailout! */
@@ -2601,6 +2782,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 					copied = -EFAULT;
 				break;
 			}
+			skb_last_copied = skb;
 		}
 
 		WRITE_ONCE(*seq, *seq + used);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 06d2573685ca..d7dee38e0410 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2291,6 +2291,14 @@ void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	unsigned long index;
+	struct page *page;
+
+	xa_for_each(&sk->sk_pagepool, index, page)
+		put_page(page);
+
+	xa_destroy(&sk->sk_pagepool);
+
 	trace_tcp_destroy_sock(sk);
 
 	tcp_clear_xmit_timers(sk);
-- 
2.41.0.390.g38632f3daf-goog


WARNING: multiple messages have this Message-ID (diff)
From: Mina Almasry <almasrymina@google.com>
To: linux-kernel@vger.kernel.org, linux-media@vger.kernel.org,
	dri-devel@lists.freedesktop.org, linaro-mm-sig@lists.linaro.org,
	netdev@vger.kernel.org, linux-arch@vger.kernel.org,
	linux-kselftest@vger.kernel.org
Cc: "Mina Almasry" <almasrymina@google.com>,
	"Sumit Semwal" <sumit.semwal@linaro.org>,
	"Christian König" <christian.koenig@amd.com>,
	"David S. Miller" <davem@davemloft.net>,
	"Eric Dumazet" <edumazet@google.com>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Paolo Abeni" <pabeni@redhat.com>,
	"Jesper Dangaard Brouer" <hawk@kernel.org>,
	"Ilias Apalodimas" <ilias.apalodimas@linaro.org>,
	"Arnd Bergmann" <arnd@arndb.de>,
	"David Ahern" <dsahern@kernel.org>,
	"Willem de Bruijn" <willemdebruijn.kernel@gmail.com>,
	"Shuah Khan" <shuah@kernel.org>,
	jgg@ziepe.ca
Subject: [RFC PATCH 05/10] tcp: implement recvmsg() RX path for devmem TCP
Date: Mon, 10 Jul 2023 15:32:56 -0700	[thread overview]
Message-ID: <20230710223304.1174642-6-almasrymina@google.com> (raw)
In-Reply-To: <20230710223304.1174642-1-almasrymina@google.com>

In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_devmem() for custom handling.

tcp_recvmsg_devmem() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
and returns to the user a cmsg_devmem indicating the location of the
data in the dmabuf device memory. cmsg_devmem contains this information:

1.  the offset into the dmabuf where the payload starts. 'frag_offset'.
2. the size of the frag. 'frag_size'.
3. an opaque token 'frag_token' to return to the kernel when the buffer
is to be released.

The pages awaiting freeing are stored in the newly added sk->sk_pagepool,
and each page passed to userspace is get_page()'d.  This reference is
dropped once the userspace indicates that it is done reading this page.
All pages are released when the socket is destroyed.

Signed-off-by: Mina Almasry <almasrymina@google.com>
---
 include/linux/socket.h            |   1 +
 include/net/sock.h                |   2 +
 include/uapi/asm-generic/socket.h |   5 +
 include/uapi/linux/uio.h          |   6 +
 net/core/datagram.c               |   3 +
 net/ipv4/tcp.c                    | 186 +++++++++++++++++++++++++++++-
 net/ipv4/tcp_ipv4.c               |   8 ++
 7 files changed, 209 insertions(+), 2 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 13c3a237b9c9..12905b2f1215 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -326,6 +326,7 @@ struct ucred {
 					  * plain text and require encryption
 					  */
 
+#define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
diff --git a/include/net/sock.h b/include/net/sock.h
index 6f428a7f3567..c615666ff19a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -353,6 +353,7 @@ struct sk_filter;
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
   *	@sk_bind2_node: bind node in the bhash2 table
+  *	@sk_pagepool: page pool associated with this socket.
   */
 struct sock {
 	/*
@@ -545,6 +546,7 @@ struct sock {
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
 	struct hlist_node	sk_bind2_node;
+	struct xarray		sk_pagepool;
 };
 
 enum sk_pacing {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 638230899e98..88f9234f78cb 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -132,6 +132,11 @@
 
 #define SO_RCVMARK		75
 
+#define SO_DEVMEM_HEADER	98
+#define SCM_DEVMEM_HEADER	SO_DEVMEM_HEADER
+#define SO_DEVMEM_OFFSET	99
+#define SCM_DEVMEM_OFFSET	SO_DEVMEM_OFFSET
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..8b0be0f50838 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -20,6 +20,12 @@ struct iovec
 	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
 };
 
+struct cmsg_devmem {
+	__u32 frag_offset;
+	__u32 frag_size;
+	__u32 frag_token;
+};
+
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 176eb5834746..3a82598aa6ed 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -455,6 +455,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
 	skb_walk_frags(skb, frag_iter) {
 		int end;
 
+		if (frag_iter->devmem)
+			goto short_copy;
+
 		WARN_ON(start > offset + len);
 
 		end = start + frag_iter->len;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 51e8d5872670..a894b8a9dbb0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+#include <linux/dma-buf.h>
 
 /* Track pending CMSGs. */
 enum {
@@ -460,6 +461,7 @@ void tcp_init_sock(struct sock *sk)
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
+	xa_init_flags(&sk->sk_pagepool, XA_FLAGS_ALLOC);
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
@@ -2408,6 +2410,165 @@ static int tcp_inq_hint(struct sock *sk)
 	return inq;
 }
 
+static int tcp_recvmsg_devmem(const struct sock *sk, const struct sk_buff *skb,
+			      unsigned int offset, struct msghdr *msg, int len)
+{
+	unsigned int start = skb_headlen(skb);
+	struct cmsg_devmem cmsg_devmem = { 0 };
+	unsigned int tokens_added_idx = 0;
+	int i, copy = start - offset, n;
+	struct sk_buff *frag_iter;
+	u32 *tokens_added;
+	int err = 0;
+
+	if (!skb->devmem)
+		return -ENODEV;
+
+	tokens_added = kzalloc(sizeof(u32) * skb_shinfo(skb)->nr_frags,
+			       GFP_KERNEL);
+
+	if (!tokens_added)
+		return -ENOMEM;
+
+	/* Copy header. */
+	if (copy > 0) {
+		copy = min(copy, len);
+
+		n = copy_to_iter(skb->data + offset, copy, &msg->msg_iter);
+		if (n != copy) {
+			err = -EFAULT;
+			goto err_release_pages;
+		}
+
+		offset += copy;
+		len -= copy;
+
+		/* First a cmsg_devmem for # bytes copied to user buffer */
+		cmsg_devmem.frag_size = copy;
+		err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_HEADER,
+			       sizeof(cmsg_devmem), &cmsg_devmem);
+		if (err)
+			goto err_release_pages;
+
+		if (len == 0)
+			goto out;
+	}
+
+	/* after that, send information of devmem pages through a sequence
+	 * of cmsg
+	 */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		struct page *page = skb_frag_page(frag);
+		struct dma_buf_pages *priv;
+		u32 user_token, frag_offset;
+		struct page *dmabuf_pages;
+		int end;
+
+		/* skb->devmem should indicate that ALL the pages in this skb
+                 * are dma buf pages. We're checking for that flag above, but
+                 * also check individual pages here. If the driver is not
+                 * setting skb->devmem correctly, we still don't want to crash
+                 * here when accessing pgmap or priv below.
+                 */
+		if (!is_dma_buf_page(page)) {
+			net_err_ratelimited("Found non-devmem skb with dma_buf "
+					    "page");
+			err = -ENODEV;
+			goto err_release_pages;
+		}
+
+		end = start + skb_frag_size(frag);
+		copy = end - offset;
+		memset(&cmsg_devmem, 0, sizeof(cmsg_devmem));
+
+		if (copy > 0) {
+			copy = min(copy, len);
+
+			priv = (struct dma_buf_pages *)page->pp->mp_priv;
+
+			dmabuf_pages = priv->pages;
+			frag_offset = ((page - dmabuf_pages) << PAGE_SHIFT) +
+				      skb_frag_off(frag) + offset - start;
+			cmsg_devmem.frag_offset = frag_offset;
+			cmsg_devmem.frag_size = copy;
+			err = xa_alloc((struct xarray *)&sk->sk_pagepool,
+				       &user_token, page, xa_limit_31b,
+				       GFP_KERNEL);
+			if (err)
+				goto err_release_pages;
+
+			tokens_added[tokens_added_idx++] = user_token;
+
+			get_page(page);
+			cmsg_devmem.frag_token = user_token;
+
+			offset += copy;
+			len -= copy;
+
+			err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_OFFSET,
+				       sizeof(cmsg_devmem), &cmsg_devmem);
+			if (err) {
+				put_page(page);
+				goto err_release_pages;
+			}
+
+			if (len == 0)
+				goto out;
+		}
+		start = end;
+	}
+
+	if (!len)
+		goto out;
+
+	/* if len is not satisfied yet, we need to skb_walk_frags() to satisfy
+	 * len
+	 */
+	skb_walk_frags(skb, frag_iter)
+	{
+		int end;
+
+		if (!frag_iter->devmem) {
+			err = -EFAULT;
+			goto err_release_pages;
+		}
+
+		WARN_ON(start > offset + len);
+		end = start + frag_iter->len;
+		copy = end - offset;
+		if (copy > 0) {
+			if (copy > len)
+				copy = len;
+			err = tcp_recvmsg_devmem(sk, frag_iter, offset - start,
+						 msg, copy);
+			if (err)
+				goto err_release_pages;
+			len -= copy;
+			if (len == 0)
+				goto out;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (len) {
+		err = -EFAULT;
+		goto err_release_pages;
+	}
+
+	goto out;
+
+err_release_pages:
+	for (i = 0; i < tokens_added_idx; i++)
+		put_page(xa_erase((struct xarray *)&sk->sk_pagepool,
+				  tokens_added[i]));
+
+out:
+	kfree(tokens_added);
+	return err;
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -2428,7 +2589,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 	int err;
 	int target;		/* Read at least this many bytes */
 	long timeo;
-	struct sk_buff *skb, *last;
+	struct sk_buff *skb, *last, *skb_last_copied = NULL;
 	u32 urg_hole = 0;
 
 	err = -ENOTCONN;
@@ -2593,7 +2754,27 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			}
 		}
 
-		if (!(flags & MSG_TRUNC)) {
+		if (skb_last_copied && skb_last_copied->devmem != skb->devmem)
+			break;
+
+		if (skb->devmem) {
+			if (!(flags & MSG_SOCK_DEVMEM)) {
+				/* skb->devmem skbs can only be received with
+				 * the MSG_SOCK_DEVMEM flag.
+				 */
+
+				copied = -EFAULT;
+				break;
+			}
+
+			err = tcp_recvmsg_devmem(sk, skb, offset, msg, used);
+			if (err) {
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+			skb_last_copied = skb;
+		} else if (!(flags & MSG_TRUNC)) {
 			err = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (err) {
 				/* Exception. Bailout! */
@@ -2601,6 +2782,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 					copied = -EFAULT;
 				break;
 			}
+			skb_last_copied = skb;
 		}
 
 		WRITE_ONCE(*seq, *seq + used);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 06d2573685ca..d7dee38e0410 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2291,6 +2291,14 @@ void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	unsigned long index;
+	struct page *page;
+
+	xa_for_each(&sk->sk_pagepool, index, page)
+		put_page(page);
+
+	xa_destroy(&sk->sk_pagepool);
+
 	trace_tcp_destroy_sock(sk);
 
 	tcp_clear_xmit_timers(sk);
-- 
2.41.0.390.g38632f3daf-goog


  parent reply	other threads:[~2023-07-10 22:34 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-10 22:32 [RFC PATCH 00/10] Device Memory TCP Mina Almasry
2023-07-10 22:32 ` Mina Almasry
2023-07-10 22:32 ` [RFC PATCH 01/10] dma-buf: add support for paged attachment mappings Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-11  7:59   ` Christian König
2023-07-11  7:59     ` Christian König
2023-07-11 11:44     ` Mina Almasry
2023-07-11 11:44       ` Mina Almasry
2023-07-11 12:13       ` Christian König
2023-07-11 12:13         ` Christian König
2023-07-10 22:32 ` [RFC PATCH 02/10] dma-buf: add support for NET_RX pages Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-10 22:32 ` [RFC PATCH 03/10] dma-buf: add support for NET_TX pages Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-10 22:32 ` [RFC PATCH 04/10] net: add support for skbs with unreadable frags Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-10 22:32 ` Mina Almasry [this message]
2023-07-10 22:32   ` [RFC PATCH 05/10] tcp: implement recvmsg() RX path for devmem TCP Mina Almasry
2023-07-10 22:32 ` [RFC PATCH 06/10] net: add SO_DEVMEM_DONTNEED setsockopt to release RX pages Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-16 23:57   ` Andy Lutomirski
2023-07-16 23:57     ` Andy Lutomirski
2023-07-17  2:06     ` Mina Almasry
2023-07-17  2:06       ` Mina Almasry
2023-07-10 22:32 ` [RFC PATCH 07/10] tcp: implement sendmsg() TX path for for devmem tcp Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-10 22:32 ` [RFC PATCH 08/10] selftests: add ncdevmem, netcat for devmem TCP Mina Almasry
2023-07-10 22:32   ` Mina Almasry
2023-07-10 22:33 ` [RFC PATCH 09/10] memory-provider: updates core provider API " Mina Almasry
2023-07-10 22:33   ` Mina Almasry
2023-07-10 22:33 ` [RFC PATCH 10/10] memory-provider: add dmabuf devmem provider Mina Almasry
2023-07-10 22:33   ` Mina Almasry
2023-07-17  2:41 ` [RFC PATCH 00/10] Device Memory TCP Andy Lutomirski
2023-07-17  2:41   ` Andy Lutomirski
2023-07-18 17:32   ` Jakub Kicinski
2023-07-18 17:32     ` Jakub Kicinski
2023-07-18 17:36   ` Mina Almasry
2023-07-18 17:36     ` Mina Almasry
2023-07-18 18:06     ` Jason Gunthorpe
2023-07-18 18:06       ` Jason Gunthorpe
2023-07-18 18:15       ` Jakub Kicinski
2023-07-18 18:15         ` Jakub Kicinski
2023-07-18 18:20         ` David Ahern
2023-07-18 18:20           ` David Ahern
2023-07-18 18:29           ` Jakub Kicinski
2023-07-18 18:29             ` Jakub Kicinski
2023-07-18 22:35             ` David Ahern
2023-07-18 22:35               ` David Ahern
2023-07-18 22:45               ` Jakub Kicinski
2023-07-18 22:45                 ` Jakub Kicinski
2023-07-19 15:10                 ` Mina Almasry
2023-07-19 15:10                   ` Mina Almasry
2023-07-19 17:57                   ` Stephen Hemminger
2023-07-19 17:57                     ` Stephen Hemminger
2023-07-19 23:24                     ` Jason Gunthorpe
2023-07-19 23:24                       ` Jason Gunthorpe
2023-07-27 11:40                       ` [Linaro-mm-sig] " Christian König
2023-07-27 11:40                         ` Christian König
2023-07-19 20:36                   ` Jakub Kicinski
2023-07-19 20:36                     ` Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230710223304.1174642-6-almasrymina@google.com \
    --to=almasrymina@google.com \
    --cc=arnd@arndb.de \
    --cc=christian.koenig@amd.com \
    --cc=davem@davemloft.net \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=hawk@kernel.org \
    --cc=ilias.apalodimas@linaro.org \
    --cc=jgg@ziepe.ca \
    --cc=kuba@kernel.org \
    --cc=linaro-mm-sig@lists.linaro.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-media@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=shuah@kernel.org \
    --cc=sumit.semwal@linaro.org \
    --cc=willemdebruijn.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.