All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mina Almasry <almasrymina@google.com>
To: Shailend Chand <shailend@google.com>,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-arch@vger.kernel.org,
	linux-kselftest@vger.kernel.org, bpf@vger.kernel.org,
	linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org
Cc: "Mina Almasry" <almasrymina@google.com>,
	"David S. Miller" <davem@davemloft.net>,
	"Eric Dumazet" <edumazet@google.com>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Paolo Abeni" <pabeni@redhat.com>,
	"Jonathan Corbet" <corbet@lwn.net>,
	"Jeroen de Borst" <jeroendb@google.com>,
	"Praveen Kaligineedi" <pkaligineedi@google.com>,
	"Jesper Dangaard Brouer" <hawk@kernel.org>,
	"Ilias Apalodimas" <ilias.apalodimas@linaro.org>,
	"Arnd Bergmann" <arnd@arndb.de>,
	"David Ahern" <dsahern@kernel.org>,
	"Willem de Bruijn" <willemdebruijn.kernel@gmail.com>,
	"Shuah Khan" <shuah@kernel.org>,
	"Sumit Semwal" <sumit.semwal@linaro.org>,
	"Christian König" <christian.koenig@amd.com>,
	"Yunsheng Lin" <linyunsheng@huawei.com>,
	"Harshitha Ramamurthy" <hramamurthy@google.com>,
	"Shakeel Butt" <shakeelb@google.com>,
	"Willem de Bruijn" <willemb@google.com>,
	"Kaiyuan Zhang" <kaiyuanz@google.com>
Subject: [net-next v1 13/16] tcp: RX path for devmem TCP
Date: Thu,  7 Dec 2023 16:52:44 -0800	[thread overview]
Message-ID: <20231208005250.2910004-14-almasrymina@google.com> (raw)
In-Reply-To: <20231208005250.2910004-1-almasrymina@google.com>

In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_devmem() for custom handling.

tcp_recvmsg_devmem() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
and returns to the user a cmsg_devmem indicating the location of the
data in the dmabuf device memory. cmsg_devmem contains this information:

1. the offset into the dmabuf where the payload starts. 'frag_offset'.
2. the size of the frag. 'frag_size'.
3. an opaque token 'frag_token' to return to the kernel when the buffer
is to be released.

The pages awaiting freeing are stored in the newly added
sk->sk_user_pages, and each page passed to userspace is get_page()'d.
This reference is dropped once the userspace indicates that it is
done reading this page.  All pages are released when the socket is
destroyed.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>

---

Changes in v1:
- Added dmabuf_id to dmabuf_cmsg (David/Stan).
- Devmem -> dmabuf (David).
- Change tcp_recvmsg_dmabuf() check to skb->dmabuf (Paolo).
- Use __skb_frag_ref() & napi_pp_put_page() for refcounting (Yunsheng).

RFC v3:
- Fixed issue with put_cmsg() failing silently.

---
 include/linux/socket.h            |   1 +
 include/net/page_pool/helpers.h   |   9 ++
 include/net/sock.h                |   2 +
 include/uapi/asm-generic/socket.h |   5 +
 include/uapi/linux/uio.h          |  10 ++
 net/ipv4/tcp.c                    | 190 +++++++++++++++++++++++++++++-
 net/ipv4/tcp_ipv4.c               |   8 ++
 7 files changed, 220 insertions(+), 5 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index cfcb7e2c3813..fe2b9e2081bb 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -326,6 +326,7 @@ struct ucred {
 					  * plain text and require encryption
 					  */
 
+#define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_SPLICE_PAGES 0x8000000	/* Splice the pages from the iterator in sendmsg() */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 2d4e0a2c5620..e7e2e89d3663 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -108,6 +108,15 @@ page_pool_iov_dma_addr(const struct page_pool_iov *ppiov)
 	       ((dma_addr_t)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
 }
 
+static inline unsigned long
+page_pool_iov_virtual_addr(const struct page_pool_iov *ppiov)
+{
+	struct dmabuf_genpool_chunk_owner *owner = page_pool_iov_owner(ppiov);
+
+	return owner->base_virtual +
+	       ((unsigned long)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
+}
+
 static inline struct netdev_dmabuf_binding *
 page_pool_iov_binding(const struct page_pool_iov *ppiov)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index 1d6931caf0c3..01029c855c1b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -353,6 +353,7 @@ struct sk_filter;
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
   *	@sk_bind2_node: bind node in the bhash2 table
+  *	@sk_user_pages: xarray of pages the user is holding a reference on.
   */
 struct sock {
 	/*
@@ -545,6 +546,7 @@ struct sock {
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
 	struct hlist_node	sk_bind2_node;
+	struct xarray		sk_user_pages;
 };
 
 enum sk_pacing {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..25a2f5255f52 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,11 @@
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_DEVMEM_LINEAR	98
+#define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
+#define SO_DEVMEM_DMABUF	99
+#define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..ad92e37699da 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -20,6 +20,16 @@ struct iovec
 	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
 };
 
+struct dmabuf_cmsg {
+	__u64 frag_offset;	/* offset into the dmabuf where the frag starts.
+				 */
+	__u32 frag_size;	/* size of the frag. */
+	__u32 frag_token;	/* token representing this frag for
+				 * DEVMEM_DONTNEED.
+				 */
+	__u32  dmabuf_id;	/* dmabuf id this frag belongs to. */
+};
+
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5a3135e93d3d..088b2b48bee0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -461,6 +461,7 @@ void tcp_init_sock(struct sock *sk)
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
+	xa_init_flags(&sk->sk_user_pages, XA_FLAGS_ALLOC1);
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
@@ -2303,6 +2304,155 @@ static int tcp_inq_hint(struct sock *sk)
 	return inq;
 }
 
+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_dmabuf(const struct sock *sk, const struct sk_buff *skb,
+			      unsigned int offset, struct msghdr *msg,
+			      int remaining_len)
+{
+	struct dmabuf_cmsg dmabuf_cmsg = { 0 };
+	unsigned int start;
+	int i, copy, n;
+	int sent = 0;
+	int err = 0;
+
+	do {
+		start = skb_headlen(skb);
+
+		if (!skb->dmabuf) {
+			err = -ENODEV;
+			goto out;
+		}
+
+		/* Copy header. */
+		copy = start - offset;
+		if (copy > 0) {
+			copy = min(copy, remaining_len);
+
+			n = copy_to_iter(skb->data + offset, copy,
+					 &msg->msg_iter);
+			if (n != copy) {
+				err = -EFAULT;
+				goto out;
+			}
+
+			offset += copy;
+			remaining_len -= copy;
+
+			/* First a dmabuf_cmsg for # bytes copied to user
+			 * buffer.
+			 */
+			memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
+			dmabuf_cmsg.frag_size = copy;
+			err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
+				       sizeof(dmabuf_cmsg), &dmabuf_cmsg);
+			if (err || msg->msg_flags & MSG_CTRUNC) {
+				msg->msg_flags &= ~MSG_CTRUNC;
+				if (!err)
+					err = -ETOOSMALL;
+				goto out;
+			}
+
+			sent += copy;
+
+			if (remaining_len == 0)
+				goto out;
+		}
+
+		/* after that, send information of dmabuf pages through a
+		 * sequence of cmsg
+		 */
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page_pool_iov *ppiov;
+			u64 frag_offset;
+			u32 user_token;
+			int end;
+
+			/* skb->dmabuf should indicate that ALL the frags in
+			 * this skb are dmabuf page_pool_iovs. We're checking
+			 * for that flag above, but also check individual frags
+			 * here. If the tcp stack is not setting skb->dmabuf
+			 * correctly, we still don't want to crash here when
+			 * accessing pgmap or priv below.
+			 */
+			if (!skb_frag_page_pool_iov(frag)) {
+				net_err_ratelimited("Found non-dmabuf skb with page_pool_iov");
+				err = -ENODEV;
+				goto out;
+			}
+
+			ppiov = skb_frag_page_pool_iov(frag);
+			end = start + skb_frag_size(frag);
+			copy = end - offset;
+
+			if (copy > 0) {
+				copy = min(copy, remaining_len);
+
+				frag_offset = page_pool_iov_virtual_addr(ppiov) +
+					      skb_frag_off(frag) + offset -
+					      start;
+				dmabuf_cmsg.frag_offset = frag_offset;
+				dmabuf_cmsg.frag_size = copy;
+				err = xa_alloc((struct xarray *)&sk->sk_user_pages,
+					       &user_token, frag->bv_page,
+					       xa_limit_31b, GFP_KERNEL);
+				if (err)
+					goto out;
+
+				dmabuf_cmsg.frag_token = user_token;
+				dmabuf_cmsg.dmabuf_id = page_pool_iov_binding_id(ppiov);
+
+				offset += copy;
+				remaining_len -= copy;
+
+				err = put_cmsg(msg, SOL_SOCKET,
+					       SO_DEVMEM_DMABUF,
+					       sizeof(dmabuf_cmsg),
+					       &dmabuf_cmsg);
+				if (err || msg->msg_flags & MSG_CTRUNC) {
+					msg->msg_flags &= ~MSG_CTRUNC;
+					xa_erase((struct xarray *)&sk->sk_user_pages,
+						 user_token);
+					if (!err)
+						err = -ETOOSMALL;
+					goto out;
+				}
+
+				__skb_frag_ref(frag);
+
+				sent += copy;
+
+				if (remaining_len == 0)
+					goto out;
+			}
+			start = end;
+		}
+
+		if (!remaining_len)
+			goto out;
+
+		/* if remaining_len is not satisfied yet, we need to go to the
+		 * next frag in the frag_list to satisfy remaining_len.
+		 */
+		skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+		offset = offset - start;
+	} while (skb);
+
+	if (remaining_len) {
+		err = -EFAULT;
+		goto out;
+	}
+
+out:
+	if (!sent)
+		sent = err;
+
+	return sent;
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -2316,6 +2466,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			      int *cmsg_flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	int last_copied_dmabuf = -1; /* uninitialized */
 	int copied = 0;
 	u32 peek_seq;
 	u32 *seq;
@@ -2493,15 +2644,44 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_msg(skb, offset, msg, used);
-			if (err) {
-				/* Exception. Bailout! */
-				if (!copied)
-					copied = -EFAULT;
+			if (last_copied_dmabuf != -1 &&
+			    last_copied_dmabuf != skb->dmabuf)
 				break;
+
+			if (!skb->dmabuf) {
+				err = skb_copy_datagram_msg(skb, offset, msg,
+							    used);
+				if (err) {
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
+			} else {
+				if (!(flags & MSG_SOCK_DEVMEM)) {
+					/* skb->dmabuf skbs can only be received
+					 * with the MSG_SOCK_DEVMEM flag.
+					 */
+					if (!copied)
+						copied = -EFAULT;
+
+					break;
+				}
+
+				err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
+							 used);
+				if (err <= 0) {
+					if (!copied)
+						copied = -EFAULT;
+
+					break;
+				}
+				used = err;
 			}
 		}
 
+		last_copied_dmabuf = skb->dmabuf;
+
 		WRITE_ONCE(*seq, *seq + used);
 		copied += used;
 		len -= used;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 86cc6d36f818..986398cc2f65 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2501,6 +2501,14 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
 void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct page *page;
+	unsigned long index;
+
+	xa_for_each(&sk->sk_user_pages, index, page)
+		if (WARN_ON_ONCE(!napi_pp_put_page(page, false)))
+			page_pool_page_put_many(page, 1);
+
+	xa_destroy(&sk->sk_user_pages);
 
 	trace_tcp_destroy_sock(sk);
 
-- 
2.43.0.472.g3155946c3a-goog


WARNING: multiple messages have this Message-ID (diff)
From: Mina Almasry <almasrymina@google.com>
To: Shailend Chand <shailend@google.com>,
	netdev@vger.kernel.org,  linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org,  linux-arch@vger.kernel.org,
	linux-kselftest@vger.kernel.org,  bpf@vger.kernel.org,
	linux-media@vger.kernel.org,  dri-devel@lists.freedesktop.org
Cc: "Mina Almasry" <almasrymina@google.com>,
	"Willem de Bruijn" <willemdebruijn.kernel@gmail.com>,
	"Kaiyuan Zhang" <kaiyuanz@google.com>,
	"Jeroen de Borst" <jeroendb@google.com>,
	"Jesper Dangaard Brouer" <hawk@kernel.org>,
	"Arnd Bergmann" <arnd@arndb.de>,
	"Jonathan Corbet" <corbet@lwn.net>,
	"David Ahern" <dsahern@kernel.org>,
	"Ilias Apalodimas" <ilias.apalodimas@linaro.org>,
	"Yunsheng Lin" <linyunsheng@huawei.com>,
	"Sumit Semwal" <sumit.semwal@linaro.org>,
	"Eric Dumazet" <edumazet@google.com>,
	"Shakeel Butt" <shakeelb@google.com>,
	"Willem de Bruijn" <willemb@google.com>,
	"Harshitha Ramamurthy" <hramamurthy@google.com>,
	"Praveen Kaligineedi" <pkaligineedi@google.com>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Christian König" <christian.koenig@amd.com>,
	"Paolo Abeni" <pabeni@redhat.com>,
	"Shuah Khan" <shuah@kernel.org>,
	"David S. Miller" <davem@davemloft.net>
Subject: [net-next v1 13/16] tcp: RX path for devmem TCP
Date: Thu,  7 Dec 2023 16:52:44 -0800	[thread overview]
Message-ID: <20231208005250.2910004-14-almasrymina@google.com> (raw)
In-Reply-To: <20231208005250.2910004-1-almasrymina@google.com>

In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_devmem() for custom handling.

tcp_recvmsg_devmem() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
and returns to the user a cmsg_devmem indicating the location of the
data in the dmabuf device memory. cmsg_devmem contains this information:

1. the offset into the dmabuf where the payload starts. 'frag_offset'.
2. the size of the frag. 'frag_size'.
3. an opaque token 'frag_token' to return to the kernel when the buffer
is to be released.

The pages awaiting freeing are stored in the newly added
sk->sk_user_pages, and each page passed to userspace is get_page()'d.
This reference is dropped once the userspace indicates that it is
done reading this page.  All pages are released when the socket is
destroyed.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>

---

Changes in v1:
- Added dmabuf_id to dmabuf_cmsg (David/Stan).
- Devmem -> dmabuf (David).
- Change tcp_recvmsg_dmabuf() check to skb->dmabuf (Paolo).
- Use __skb_frag_ref() & napi_pp_put_page() for refcounting (Yunsheng).

RFC v3:
- Fixed issue with put_cmsg() failing silently.

---
 include/linux/socket.h            |   1 +
 include/net/page_pool/helpers.h   |   9 ++
 include/net/sock.h                |   2 +
 include/uapi/asm-generic/socket.h |   5 +
 include/uapi/linux/uio.h          |  10 ++
 net/ipv4/tcp.c                    | 190 +++++++++++++++++++++++++++++-
 net/ipv4/tcp_ipv4.c               |   8 ++
 7 files changed, 220 insertions(+), 5 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index cfcb7e2c3813..fe2b9e2081bb 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -326,6 +326,7 @@ struct ucred {
 					  * plain text and require encryption
 					  */
 
+#define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_SPLICE_PAGES 0x8000000	/* Splice the pages from the iterator in sendmsg() */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 2d4e0a2c5620..e7e2e89d3663 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -108,6 +108,15 @@ page_pool_iov_dma_addr(const struct page_pool_iov *ppiov)
 	       ((dma_addr_t)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
 }
 
+static inline unsigned long
+page_pool_iov_virtual_addr(const struct page_pool_iov *ppiov)
+{
+	struct dmabuf_genpool_chunk_owner *owner = page_pool_iov_owner(ppiov);
+
+	return owner->base_virtual +
+	       ((unsigned long)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
+}
+
 static inline struct netdev_dmabuf_binding *
 page_pool_iov_binding(const struct page_pool_iov *ppiov)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index 1d6931caf0c3..01029c855c1b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -353,6 +353,7 @@ struct sk_filter;
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
   *	@sk_bind2_node: bind node in the bhash2 table
+  *	@sk_user_pages: xarray of pages the user is holding a reference on.
   */
 struct sock {
 	/*
@@ -545,6 +546,7 @@ struct sock {
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
 	struct hlist_node	sk_bind2_node;
+	struct xarray		sk_user_pages;
 };
 
 enum sk_pacing {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..25a2f5255f52 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,11 @@
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_DEVMEM_LINEAR	98
+#define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
+#define SO_DEVMEM_DMABUF	99
+#define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..ad92e37699da 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -20,6 +20,16 @@ struct iovec
 	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
 };
 
+struct dmabuf_cmsg {
+	__u64 frag_offset;	/* offset into the dmabuf where the frag starts.
+				 */
+	__u32 frag_size;	/* size of the frag. */
+	__u32 frag_token;	/* token representing this frag for
+				 * DEVMEM_DONTNEED.
+				 */
+	__u32  dmabuf_id;	/* dmabuf id this frag belongs to. */
+};
+
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5a3135e93d3d..088b2b48bee0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -461,6 +461,7 @@ void tcp_init_sock(struct sock *sk)
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
+	xa_init_flags(&sk->sk_user_pages, XA_FLAGS_ALLOC1);
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
@@ -2303,6 +2304,155 @@ static int tcp_inq_hint(struct sock *sk)
 	return inq;
 }
 
+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_dmabuf(const struct sock *sk, const struct sk_buff *skb,
+			      unsigned int offset, struct msghdr *msg,
+			      int remaining_len)
+{
+	struct dmabuf_cmsg dmabuf_cmsg = { 0 };
+	unsigned int start;
+	int i, copy, n;
+	int sent = 0;
+	int err = 0;
+
+	do {
+		start = skb_headlen(skb);
+
+		if (!skb->dmabuf) {
+			err = -ENODEV;
+			goto out;
+		}
+
+		/* Copy header. */
+		copy = start - offset;
+		if (copy > 0) {
+			copy = min(copy, remaining_len);
+
+			n = copy_to_iter(skb->data + offset, copy,
+					 &msg->msg_iter);
+			if (n != copy) {
+				err = -EFAULT;
+				goto out;
+			}
+
+			offset += copy;
+			remaining_len -= copy;
+
+			/* First a dmabuf_cmsg for # bytes copied to user
+			 * buffer.
+			 */
+			memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
+			dmabuf_cmsg.frag_size = copy;
+			err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
+				       sizeof(dmabuf_cmsg), &dmabuf_cmsg);
+			if (err || msg->msg_flags & MSG_CTRUNC) {
+				msg->msg_flags &= ~MSG_CTRUNC;
+				if (!err)
+					err = -ETOOSMALL;
+				goto out;
+			}
+
+			sent += copy;
+
+			if (remaining_len == 0)
+				goto out;
+		}
+
+		/* after that, send information of dmabuf pages through a
+		 * sequence of cmsg
+		 */
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page_pool_iov *ppiov;
+			u64 frag_offset;
+			u32 user_token;
+			int end;
+
+			/* skb->dmabuf should indicate that ALL the frags in
+			 * this skb are dmabuf page_pool_iovs. We're checking
+			 * for that flag above, but also check individual frags
+			 * here. If the tcp stack is not setting skb->dmabuf
+			 * correctly, we still don't want to crash here when
+			 * accessing pgmap or priv below.
+			 */
+			if (!skb_frag_page_pool_iov(frag)) {
+				net_err_ratelimited("Found non-dmabuf skb with page_pool_iov");
+				err = -ENODEV;
+				goto out;
+			}
+
+			ppiov = skb_frag_page_pool_iov(frag);
+			end = start + skb_frag_size(frag);
+			copy = end - offset;
+
+			if (copy > 0) {
+				copy = min(copy, remaining_len);
+
+				frag_offset = page_pool_iov_virtual_addr(ppiov) +
+					      skb_frag_off(frag) + offset -
+					      start;
+				dmabuf_cmsg.frag_offset = frag_offset;
+				dmabuf_cmsg.frag_size = copy;
+				err = xa_alloc((struct xarray *)&sk->sk_user_pages,
+					       &user_token, frag->bv_page,
+					       xa_limit_31b, GFP_KERNEL);
+				if (err)
+					goto out;
+
+				dmabuf_cmsg.frag_token = user_token;
+				dmabuf_cmsg.dmabuf_id = page_pool_iov_binding_id(ppiov);
+
+				offset += copy;
+				remaining_len -= copy;
+
+				err = put_cmsg(msg, SOL_SOCKET,
+					       SO_DEVMEM_DMABUF,
+					       sizeof(dmabuf_cmsg),
+					       &dmabuf_cmsg);
+				if (err || msg->msg_flags & MSG_CTRUNC) {
+					msg->msg_flags &= ~MSG_CTRUNC;
+					xa_erase((struct xarray *)&sk->sk_user_pages,
+						 user_token);
+					if (!err)
+						err = -ETOOSMALL;
+					goto out;
+				}
+
+				__skb_frag_ref(frag);
+
+				sent += copy;
+
+				if (remaining_len == 0)
+					goto out;
+			}
+			start = end;
+		}
+
+		if (!remaining_len)
+			goto out;
+
+		/* if remaining_len is not satisfied yet, we need to go to the
+		 * next frag in the frag_list to satisfy remaining_len.
+		 */
+		skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+		offset = offset - start;
+	} while (skb);
+
+	if (remaining_len) {
+		err = -EFAULT;
+		goto out;
+	}
+
+out:
+	if (!sent)
+		sent = err;
+
+	return sent;
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -2316,6 +2466,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			      int *cmsg_flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	int last_copied_dmabuf = -1; /* uninitialized */
 	int copied = 0;
 	u32 peek_seq;
 	u32 *seq;
@@ -2493,15 +2644,44 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_msg(skb, offset, msg, used);
-			if (err) {
-				/* Exception. Bailout! */
-				if (!copied)
-					copied = -EFAULT;
+			if (last_copied_dmabuf != -1 &&
+			    last_copied_dmabuf != skb->dmabuf)
 				break;
+
+			if (!skb->dmabuf) {
+				err = skb_copy_datagram_msg(skb, offset, msg,
+							    used);
+				if (err) {
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
+			} else {
+				if (!(flags & MSG_SOCK_DEVMEM)) {
+					/* skb->dmabuf skbs can only be received
+					 * with the MSG_SOCK_DEVMEM flag.
+					 */
+					if (!copied)
+						copied = -EFAULT;
+
+					break;
+				}
+
+				err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
+							 used);
+				if (err <= 0) {
+					if (!copied)
+						copied = -EFAULT;
+
+					break;
+				}
+				used = err;
 			}
 		}
 
+		last_copied_dmabuf = skb->dmabuf;
+
 		WRITE_ONCE(*seq, *seq + used);
 		copied += used;
 		len -= used;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 86cc6d36f818..986398cc2f65 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2501,6 +2501,14 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
 void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct page *page;
+	unsigned long index;
+
+	xa_for_each(&sk->sk_user_pages, index, page)
+		if (WARN_ON_ONCE(!napi_pp_put_page(page, false)))
+			page_pool_page_put_many(page, 1);
+
+	xa_destroy(&sk->sk_user_pages);
 
 	trace_tcp_destroy_sock(sk);
 
-- 
2.43.0.472.g3155946c3a-goog


  parent reply	other threads:[~2023-12-08  0:54 UTC|newest]

Thread overview: 145+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-08  0:52 [net-next v1 00/16] Device Memory TCP Mina Almasry
2023-12-08  0:52 ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 01/16] net: page_pool: factor out releasing DMA from releasing the page Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-10  3:49   ` Shakeel Butt
2023-12-10  3:49     ` Shakeel Butt
2023-12-12  8:11   ` Ilias Apalodimas
2023-12-12  8:11     ` Ilias Apalodimas
2023-12-08  0:52 ` [net-next v1 02/16] net: page_pool: create hooks for custom page providers Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-12  8:07   ` Ilias Apalodimas
2023-12-12  8:07     ` Ilias Apalodimas
2023-12-12 14:47     ` Mina Almasry
2023-12-12 14:47       ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 03/16] queue_api: define queue api Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-14  1:15   ` Jakub Kicinski
2023-12-14  1:15     ` Jakub Kicinski
2023-12-08  0:52 ` [net-next v1 04/16] gve: implement " Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2024-03-05 11:45   ` Arnd Bergmann
2023-12-08  0:52 ` [net-next v1 05/16] net: netdev netlink api to bind dma-buf to a net device Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-14  1:17   ` Jakub Kicinski
2023-12-14  1:17     ` Jakub Kicinski
2023-12-08  0:52 ` [net-next v1 06/16] netdev: support binding dma-buf to netdevice Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08 15:40   ` kernel test robot
2023-12-08 15:40     ` kernel test robot
2023-12-08 16:02   ` kernel test robot
2023-12-08 16:02     ` kernel test robot
2023-12-08 17:48   ` David Ahern
2023-12-08 17:48     ` David Ahern
2023-12-08 19:22     ` Mina Almasry
2023-12-08 19:22       ` Mina Almasry
2023-12-08 20:32       ` Mina Almasry
2023-12-08 20:32         ` Mina Almasry
2023-12-09 23:29       ` David Ahern
2023-12-09 23:29         ` David Ahern
2023-12-11  2:19         ` Mina Almasry
2023-12-11  2:19           ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 07/16] netdev: netdevice devmem allocator Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08 17:56   ` David Ahern
2023-12-08 17:56     ` David Ahern
2023-12-08 19:27     ` Mina Almasry
2023-12-08 19:27       ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 08/16] memory-provider: dmabuf devmem memory provider Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08 22:48   ` Pavel Begunkov
2023-12-08 22:48     ` Pavel Begunkov
2023-12-08 23:25     ` Mina Almasry
2023-12-08 23:25       ` Mina Almasry
2023-12-10  3:03       ` Pavel Begunkov
2023-12-10  3:03         ` Pavel Begunkov
2023-12-11  2:30         ` Mina Almasry
2023-12-11  2:30           ` Mina Almasry
2023-12-11 20:35           ` Pavel Begunkov
2023-12-11 20:35             ` Pavel Begunkov
2023-12-14 20:03             ` Mina Almasry
2023-12-14 20:03               ` Mina Almasry
2023-12-19 23:55               ` Pavel Begunkov
2023-12-19 23:55                 ` Pavel Begunkov
2023-12-08 23:05   ` Pavel Begunkov
2023-12-08 23:05     ` Pavel Begunkov
2023-12-12 12:25   ` Jason Gunthorpe
2023-12-12 12:25     ` Jason Gunthorpe
2023-12-12 13:07     ` Christoph Hellwig
2023-12-12 14:26     ` Mina Almasry
2023-12-12 14:26       ` Mina Almasry
2023-12-12 14:39       ` Jason Gunthorpe
2023-12-12 14:39         ` Jason Gunthorpe
2023-12-12 14:58         ` Mina Almasry
2023-12-12 14:58           ` Mina Almasry
2023-12-12 15:08           ` Jason Gunthorpe
2023-12-12 15:08             ` Jason Gunthorpe
2023-12-13  1:09             ` Mina Almasry
2023-12-13  1:09               ` Mina Almasry
2023-12-13  2:19               ` David Ahern
2023-12-13  2:19                 ` David Ahern
2023-12-13  7:49   ` Yinjun Zhang
2023-12-13  7:49     ` Yinjun Zhang
2023-12-08  0:52 ` [net-next v1 09/16] page_pool: device memory support Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08  9:30   ` Yunsheng Lin
2023-12-08  9:30     ` Yunsheng Lin
2023-12-08 16:05     ` Mina Almasry
2023-12-08 16:05       ` Mina Almasry
2023-12-11  2:04       ` Yunsheng Lin
2023-12-11  2:04         ` Yunsheng Lin
2023-12-11  2:26         ` Mina Almasry
2023-12-11  2:26           ` Mina Almasry
2023-12-11  4:04           ` Mina Almasry
2023-12-11  4:04             ` Mina Almasry
2023-12-11 11:51             ` Yunsheng Lin
2023-12-11 11:51               ` Yunsheng Lin
2023-12-11 18:14               ` Mina Almasry
2023-12-11 18:14                 ` Mina Almasry
2023-12-12 11:17                 ` Yunsheng Lin
2023-12-12 11:17                   ` Yunsheng Lin
2023-12-12 14:28                   ` Mina Almasry
2023-12-12 14:28                     ` Mina Almasry
2023-12-13 11:48                     ` Yunsheng Lin
2023-12-13 11:48                       ` Yunsheng Lin
2023-12-13  7:52             ` Mina Almasry
2023-12-13  7:52               ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 10/16] page_pool: don't release iov on elevanted refcount Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 11/16] net: support non paged skb frags Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 12/16] net: add support for skbs with unreadable frags Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08  0:52 ` Mina Almasry [this message]
2023-12-08  0:52   ` [net-next v1 13/16] tcp: RX path for devmem TCP Mina Almasry
2023-12-08 15:40   ` kernel test robot
2023-12-08 15:40     ` kernel test robot
2023-12-08 17:55   ` David Ahern
2023-12-08 17:55     ` David Ahern
2023-12-08 19:23     ` Mina Almasry
2023-12-08 19:23       ` Mina Almasry
2023-12-08  0:52 ` [net-next v1 14/16] net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-12 19:08   ` Simon Horman
2023-12-12 19:08     ` Simon Horman
2023-12-08  0:52 ` [net-next v1 15/16] net: add devmem TCP documentation Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-12 19:14   ` Simon Horman
2023-12-12 19:14     ` Simon Horman
2023-12-08  0:52 ` [net-next v1 16/16] selftests: add ncdevmem, netcat for devmem TCP Mina Almasry
2023-12-08  0:52   ` Mina Almasry
2023-12-08  1:47 ` [net-next v1 00/16] Device Memory TCP Mina Almasry
2023-12-08  1:47   ` Mina Almasry
2023-12-08 17:57 ` David Ahern
2023-12-08 17:57   ` David Ahern
2023-12-08 19:31   ` Mina Almasry
2023-12-08 19:31     ` Mina Almasry
2023-12-10  3:48 ` Shakeel Butt
2023-12-10  3:48   ` Shakeel Butt
2023-12-12  5:58 ` Christoph Hellwig
2023-12-14  6:20 ` patchwork-bot+netdevbpf
2023-12-14  6:20   ` patchwork-bot+netdevbpf
2023-12-14  6:48   ` Christoph Hellwig
2023-12-14  6:51     ` Mina Almasry
2023-12-14  6:51       ` Mina Almasry
2023-12-14  6:59       ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231208005250.2910004-14-almasrymina@google.com \
    --to=almasrymina@google.com \
    --cc=arnd@arndb.de \
    --cc=bpf@vger.kernel.org \
    --cc=christian.koenig@amd.com \
    --cc=corbet@lwn.net \
    --cc=davem@davemloft.net \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=hawk@kernel.org \
    --cc=hramamurthy@google.com \
    --cc=ilias.apalodimas@linaro.org \
    --cc=jeroendb@google.com \
    --cc=kaiyuanz@google.com \
    --cc=kuba@kernel.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-media@vger.kernel.org \
    --cc=linyunsheng@huawei.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=pkaligineedi@google.com \
    --cc=shailend@google.com \
    --cc=shakeelb@google.com \
    --cc=shuah@kernel.org \
    --cc=sumit.semwal@linaro.org \
    --cc=willemb@google.com \
    --cc=willemdebruijn.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.