* [Patch bpf-next v1 0/4] sockmap: some performance optimizations
@ 2022-04-10 16:10 Cong Wang
  2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-10 16:10 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang

From: Cong Wang <cong.wang@bytedance.com>

This patchset contains two optimizations for sockmap. The first one
eliminates an skb_clone() and the second one eliminates a memset().
After this patchset, the throughput of UDP transmission via sockmap
improves by 61%.

Cong Wang (4):
  tcp: introduce tcp_read_skb()
  net: introduce a new proto_ops ->read_skb()
  skmsg: get rid of skb_clone()
  skmsg: get rid of unnecessary memset()

 include/linux/net.h |  3 ++
 include/net/tcp.h   |  1 +
 include/net/udp.h   |  3 +-
 net/core/skmsg.c    | 48 ++++++++++++-------------------
 net/ipv4/af_inet.c  |  3 +-
 net/ipv4/tcp.c      | 69 +++++++++++++++++++++++++++++++++++++++------
 net/ipv4/udp.c      | 11 ++++----
 net/ipv6/af_inet6.c |  3 +-
 net/unix/af_unix.c  | 23 ++++++---------
 9 files changed, 102 insertions(+), 62 deletions(-)

-- 
2.32.0



* [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-10 16:10 [Patch bpf-next v1 0/4] sockmap: some performance optimizations Cong Wang
@ 2022-04-10 16:10 ` Cong Wang
  2022-04-12 20:00   ` John Fastabend
                     ` (2 more replies)
  2022-04-10 16:10 ` [Patch bpf-next v1 2/4] net: introduce a new proto_ops ->read_skb() Cong Wang
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-10 16:10 UTC (permalink / raw)
  To: netdev
  Cc: Cong Wang, Eric Dumazet, John Fastabend, Daniel Borkmann, Jakub Sitnicki

From: Cong Wang <cong.wang@bytedance.com>

This patch introduces tcp_read_skb() based on tcp_read_sock(),
in preparation for the next patch, which introduces a new
proto_ops ->read_skb().

TCP is special here because it has tcp_read_sock(), which is
mainly used by splice(). tcp_read_sock() supports partial reads
and arbitrary offsets, neither of which is needed by sockmap.
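
For illustration, a hypothetical recv actor for this interface could
look like the sketch below (example_recv_actor and
queue_for_processing() are made-up names, not part of this patch;
note that tcp_read_skb() frees the skb after the actor returns, so an
actor that keeps the skb must grab its own reference):

	static int example_recv_actor(read_descriptor_t *desc,
				      struct sk_buff *skb,
				      unsigned int offset, size_t len)
	{
		/* tcp_read_skb() has already unlinked the skb from
		 * sk_receive_queue and will free it after we return,
		 * so take a reference to keep it alive.
		 */
		skb_get(skb);
		queue_for_processing(skb);	/* hypothetical consumer */
		return len;			/* everything consumed */
	}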

Cc: Eric Dumazet <edumazet@google.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/net/tcp.h |  2 ++
 net/ipv4/tcp.c    | 72 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d50a662bf89..f0d4ce6855e1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -667,6 +667,8 @@ void tcp_get_info(struct sock *, struct tcp_info *);
 /* Read 'sendfile()'-style from a TCP socket */
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		  sk_read_actor_t recv_actor);
+int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
+		 sk_read_actor_t recv_actor);
 
 void tcp_initialize_rcv_mss(struct sock *sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e31cf137c614..8b054bcc6849 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1619,7 +1619,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
-static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off, bool unlink)
 {
 	struct sk_buff *skb;
 	u32 offset;
@@ -1632,6 +1632,8 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 		}
 		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
 			*off = offset;
+			if (unlink)
+				__skb_unlink(skb, &sk->sk_receive_queue);
 			return skb;
 		}
 		/* This looks weird, but this can happen if TCP collapsing
@@ -1665,7 +1667,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 	if (sk->sk_state == TCP_LISTEN)
 		return -ENOTCONN;
-	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+	while ((skb = tcp_recv_skb(sk, seq, &offset, false)) != NULL) {
 		if (offset < skb->len) {
 			int used;
 			size_t len;
@@ -1696,7 +1698,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			 * getting here: tcp_collapse might have deleted it
 			 * while aggregating skbs from the socket queue.
 			 */
-			skb = tcp_recv_skb(sk, seq - 1, &offset);
+			skb = tcp_recv_skb(sk, seq - 1, &offset, false);
 			if (!skb)
 				break;
 			/* TCP coalescing might have appended data to the skb.
@@ -1721,13 +1723,67 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 	/* Clean up data we have read: This will do ACK frames. */
 	if (copied > 0) {
-		tcp_recv_skb(sk, seq, &offset);
+		tcp_recv_skb(sk, seq, &offset, false);
 		tcp_cleanup_rbuf(sk, copied);
 	}
 	return copied;
 }
 EXPORT_SYMBOL(tcp_read_sock);
 
+int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
+		 sk_read_actor_t recv_actor)
+{
+	struct sk_buff *skb;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq;
+	u32 offset;
+	int copied = 0;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+	while ((skb = tcp_recv_skb(sk, seq, &offset, true)) != NULL) {
+		if (offset < skb->len) {
+			int used;
+			size_t len;
+
+			len = skb->len - offset;
+			used = recv_actor(desc, skb, offset, len);
+			if (used <= 0) {
+				if (!copied)
+					copied = used;
+				break;
+			}
+			if (WARN_ON_ONCE(used > len))
+				used = len;
+			seq += used;
+			copied += used;
+			offset += used;
+
+			if (offset != skb->len)
+				continue;
+		}
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+			kfree_skb(skb);
+			++seq;
+			break;
+		}
+		kfree_skb(skb);
+		if (!desc->count)
+			break;
+		WRITE_ONCE(tp->copied_seq, seq);
+	}
+	WRITE_ONCE(tp->copied_seq, seq);
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (copied > 0)
+		tcp_cleanup_rbuf(sk, copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(tcp_read_skb);
+
 int tcp_peek_len(struct socket *sock)
 {
 	return tcp_inq(sock->sk);
@@ -1910,7 +1966,7 @@ static int receive_fallback_to_copy(struct sock *sk,
 		struct sk_buff *skb;
 		u32 offset;
 
-		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset, false);
 		if (skb)
 			tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
 	}
@@ -1957,7 +2013,7 @@ static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
 	if (skb) {
 		offset = *seq - TCP_SKB_CB(skb)->seq;
 	} else {
-		skb = tcp_recv_skb(sk, *seq, &offset);
+		skb = tcp_recv_skb(sk, *seq, &offset, false);
 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
 			tcp_update_recv_tstamps(skb, tss);
 			zc->msg_flags |= TCP_CMSG_TS;
@@ -2150,7 +2206,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
 				skb = skb->next;
 				offset = seq - TCP_SKB_CB(skb)->seq;
 			} else {
-				skb = tcp_recv_skb(sk, seq, &offset);
+				skb = tcp_recv_skb(sk, seq, &offset, false);
 			}
 
 			if (TCP_SKB_CB(skb)->has_rxtstamp) {
@@ -2206,7 +2262,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
 		tcp_rcv_space_adjust(sk);
 
 		/* Clean up data we have read: This will do ACK frames. */
-		tcp_recv_skb(sk, seq, &offset);
+		tcp_recv_skb(sk, seq, &offset, false);
 		tcp_cleanup_rbuf(sk, length + copylen);
 		ret = 0;
 		if (length == zc->length)
-- 
2.32.0



* [Patch bpf-next v1 2/4] net: introduce a new proto_ops ->read_skb()
  2022-04-10 16:10 [Patch bpf-next v1 0/4] sockmap: some performance optimizations Cong Wang
  2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
@ 2022-04-10 16:10 ` Cong Wang
  2022-04-10 16:10 ` [Patch bpf-next v1 3/4] skmsg: get rid of skb_clone() Cong Wang
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-10 16:10 UTC (permalink / raw)
  To: netdev
  Cc: Cong Wang, Eric Dumazet, John Fastabend, Daniel Borkmann, Jakub Sitnicki

From: Cong Wang <cong.wang@bytedance.com>

Currently both splice() and sockmap use ->read_sock() to
read skbs from the receive queue, but sockmap only ever reads
one entire skb at a time, so ->read_sock() is too conservative
for it. Introduce a new proto_ops ->read_skb() which supports
this semantic; with it we can finally pass the ownership of
the skb to the recv actors.

For non-TCP protocols, all ->read_sock() implementations can
simply be converted to ->read_skb().
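
As a sketch, a ->read_skb() user then boils down to something like
the following (example_read_actor and example_data_ready are
illustrative names, not part of this patch):

	/* The actor receives one whole, already dequeued skb per
	 * call and returns how much of it was consumed.
	 */
	static int example_read_actor(struct sock *sk, struct sk_buff *skb)
	{
		return skb->len;	/* consume the entire skb */
	}

	static void example_data_ready(struct sock *sk)
	{
		struct socket *sock = sk->sk_socket;

		if (sock && sock->ops && sock->ops->read_skb)
			sock->ops->read_skb(sk, example_read_actor);
	}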

Cc: Eric Dumazet <edumazet@google.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/net.h |  3 +++
 include/net/tcp.h   |  3 +--
 include/net/udp.h   |  3 +--
 net/core/skmsg.c    | 20 +++++---------------
 net/ipv4/af_inet.c  |  3 ++-
 net/ipv4/tcp.c      |  9 +++------
 net/ipv4/udp.c      | 10 ++++------
 net/ipv6/af_inet6.c |  3 ++-
 net/unix/af_unix.c  | 23 +++++++++--------------
 9 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 12093f4db50c..adcc4e54ec4a 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -152,6 +152,8 @@ struct module;
 struct sk_buff;
 typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 			       unsigned int, size_t);
+typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *);
+
 
 struct proto_ops {
 	int		family;
@@ -214,6 +216,7 @@ struct proto_ops {
 	 */
 	int		(*read_sock)(struct sock *sk, read_descriptor_t *desc,
 				     sk_read_actor_t recv_actor);
+	int		(*read_skb)(struct sock *sk, skb_read_actor_t recv_actor);
 	int		(*sendpage_locked)(struct sock *sk, struct page *page,
 					   int offset, size_t size, int flags);
 	int		(*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f0d4ce6855e1..56946d5e8160 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -667,8 +667,7 @@ void tcp_get_info(struct sock *, struct tcp_info *);
 /* Read 'sendfile()'-style from a TCP socket */
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		  sk_read_actor_t recv_actor);
-int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
-		 sk_read_actor_t recv_actor);
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 
 void tcp_initialize_rcv_mss(struct sock *sk);
 
diff --git a/include/net/udp.h b/include/net/udp.h
index f1c2a88c9005..90cc590d42e3 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -305,8 +305,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			       struct sk_buff *skb);
 struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
 				 __be16 sport, __be16 dport);
-int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
-		  sk_read_actor_t recv_actor);
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 
 /* UDP uses skb->dev_scratch to cache as much information as possible and avoid
  * possibly multiple cache miss on dequeue()
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index cc381165ea08..19bca36940a2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1155,21 +1155,17 @@ static void sk_psock_done_strp(struct sk_psock *psock)
 }
 #endif /* CONFIG_BPF_STREAM_PARSER */
 
-static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
-				 unsigned int offset, size_t orig_len)
+static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 {
-	struct sock *sk = (struct sock *)desc->arg.data;
 	struct sk_psock *psock;
 	struct bpf_prog *prog;
 	int ret = __SK_DROP;
-	int len = orig_len;
+	int len = skb->len;
 
 	/* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
 	skb = skb_clone(skb, GFP_ATOMIC);
-	if (!skb) {
-		desc->error = -ENOMEM;
+	if (!skb)
 		return 0;
-	}
 
 	rcu_read_lock();
 	psock = sk_psock(sk);
@@ -1199,16 +1195,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 static void sk_psock_verdict_data_ready(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
-	read_descriptor_t desc;
 
-	if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+	if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
 		return;
-
-	desc.arg.data = sk;
-	desc.error = 0;
-	desc.count = 1;
-
-	sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
+	sock->ops->read_skb(sk, sk_psock_verdict_recv);
 }
 
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 72fde2888ad2..c60262bcac88 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1041,6 +1041,7 @@ const struct proto_ops inet_stream_ops = {
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
+	.read_skb	   = tcp_read_skb,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
 	.peek_len	   = tcp_peek_len,
@@ -1068,7 +1069,7 @@ const struct proto_ops inet_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
-	.read_sock	   = udp_read_sock,
+	.read_skb	   = udp_read_skb,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8b054bcc6849..74e472e8178f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1730,8 +1730,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 }
 EXPORT_SYMBOL(tcp_read_sock);
 
-int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
-		 sk_read_actor_t recv_actor)
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	struct sk_buff *skb;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1747,7 +1746,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
 			size_t len;
 
 			len = skb->len - offset;
-			used = recv_actor(desc, skb, offset, len);
+			used = recv_actor(sk, skb);
 			if (used <= 0) {
 				if (!copied)
 					copied = used;
@@ -1768,9 +1767,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
 			break;
 		}
 		kfree_skb(skb);
-		if (!desc->count)
-			break;
-		WRITE_ONCE(tp->copied_seq, seq);
+		break;
 	}
 	WRITE_ONCE(tp->copied_seq, seq);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6b4d8361560f..9faca5758ed6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1796,8 +1796,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 }
 EXPORT_SYMBOL(__skb_recv_udp);
 
-int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
-		  sk_read_actor_t recv_actor)
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	int copied = 0;
 
@@ -1819,7 +1818,7 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			continue;
 		}
 
-		used = recv_actor(desc, skb, 0, skb->len);
+		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)
 				copied = used;
@@ -1830,13 +1829,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		}
 
 		kfree_skb(skb);
-		if (!desc->count)
-			break;
+		break;
 	}
 
 	return copied;
 }
-EXPORT_SYMBOL(udp_read_sock);
+EXPORT_SYMBOL(udp_read_skb);
 
 /*
  * 	This should be easy, if there is something there we
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7d7b7523d126..06c1b16aa739 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -702,6 +702,7 @@ const struct proto_ops inet6_stream_ops = {
 	.sendpage_locked   = tcp_sendpage_locked,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
+	.read_skb	   = tcp_read_skb,
 	.peek_len	   = tcp_peek_len,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet6_compat_ioctl,
@@ -727,7 +728,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet6_sendmsg,		/* retpoline's sake */
 	.recvmsg	   = inet6_recvmsg,		/* retpoline's sake */
-	.read_sock	   = udp_read_sock,
+	.read_skb	   = udp_read_skb,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
 	.set_peek_off	   = sk_set_peek_off,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fecbd95da918..06cf0570635d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -741,10 +741,8 @@ static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 				       unsigned int flags);
 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
-static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
-			  sk_read_actor_t recv_actor);
-static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
-				 sk_read_actor_t recv_actor);
+static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
+static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 			      int, int);
 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -798,7 +796,7 @@ static const struct proto_ops unix_stream_ops = {
 	.shutdown =	unix_shutdown,
 	.sendmsg =	unix_stream_sendmsg,
 	.recvmsg =	unix_stream_recvmsg,
-	.read_sock =	unix_stream_read_sock,
+	.read_skb =	unix_stream_read_skb,
 	.mmap =		sock_no_mmap,
 	.sendpage =	unix_stream_sendpage,
 	.splice_read =	unix_stream_splice_read,
@@ -823,7 +821,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
 	.sendmsg =	unix_dgram_sendmsg,
-	.read_sock =	unix_read_sock,
+	.read_skb =	unix_read_skb,
 	.recvmsg =	unix_dgram_recvmsg,
 	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
@@ -2490,8 +2488,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
 	return __unix_dgram_recvmsg(sk, msg, size, flags);
 }
 
-static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
-			  sk_read_actor_t recv_actor)
+static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	int copied = 0;
 
@@ -2506,7 +2503,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 		if (!skb)
 			return err;
 
-		used = recv_actor(desc, skb, 0, skb->len);
+		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)
 				copied = used;
@@ -2517,8 +2514,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 		}
 
 		kfree_skb(skb);
-		if (!desc->count)
-			break;
+		break;
 	}
 
 	return copied;
@@ -2653,13 +2649,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 }
 #endif
 
-static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
-				 sk_read_actor_t recv_actor)
+static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
 		return -ENOTCONN;
 
-	return unix_read_sock(sk, desc, recv_actor);
+	return unix_read_skb(sk, recv_actor);
 }
 
 static int unix_stream_read_generic(struct unix_stream_read_state *state,
-- 
2.32.0



* [Patch bpf-next v1 3/4] skmsg: get rid of skb_clone()
  2022-04-10 16:10 [Patch bpf-next v1 0/4] sockmap: some performance optimizations Cong Wang
  2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
  2022-04-10 16:10 ` [Patch bpf-next v1 2/4] net: introduce a new proto_ops ->read_skb() Cong Wang
@ 2022-04-10 16:10 ` Cong Wang
  2022-04-10 16:10 ` [Patch bpf-next v1 4/4] skmsg: get rid of unnecessary memset() Cong Wang
  2022-04-26  9:27 ` [Patch bpf-next v1 0/4] sockmap: some performance optimizations Jakub Sitnicki
  4 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-10 16:10 UTC (permalink / raw)
  To: netdev
  Cc: Cong Wang, Eric Dumazet, John Fastabend, Daniel Borkmann, Jakub Sitnicki

From: Cong Wang <cong.wang@bytedance.com>

With ->read_skb() we now have an entire skb dequeued from the
receive queue, so we just need to grab an additional refcnt
before passing its ownership to the recv actors.

And we should not touch the skb any more afterwards, particularly
skb->sk. Fortunately, skb->sk is already set for most of the
protocols, except for UDP where skb->sk has been stolen, so we
have to fix it up for the UDP case.
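
For reference, skb_set_owner_sk_safe(), which the UDP path below
uses to restore skb->sk, looks roughly like this (simplified from
net/core/sock.c):

	bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
	{
		if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
			skb_orphan(skb);
			skb->destructor = sock_efree;
			skb->sk = sk;
			return true;
		}
		return false;
	}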

Cc: Eric Dumazet <edumazet@google.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 net/core/skmsg.c | 7 +------
 net/ipv4/udp.c   | 1 +
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 19bca36940a2..7aa37b6287e1 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1162,10 +1162,7 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 	int ret = __SK_DROP;
 	int len = skb->len;
 
-	/* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
-	skb = skb_clone(skb, GFP_ATOMIC);
-	if (!skb)
-		return 0;
+	skb_get(skb);
 
 	rcu_read_lock();
 	psock = sk_psock(sk);
@@ -1178,12 +1175,10 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 	if (!prog)
 		prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
-		skb->sk = sk;
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
 		ret = bpf_prog_run_pin_on_cpu(prog, skb);
 		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
-		skb->sk = NULL;
 	}
 	if (sk_psock_verdict_apply(psock, skb, ret) < 0)
 		len = 0;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9faca5758ed6..dbf33f68555d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1818,6 +1818,7 @@ int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 			continue;
 		}
 
+		WARN_ON(!skb_set_owner_sk_safe(skb, sk));
 		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)
-- 
2.32.0



* [Patch bpf-next v1 4/4] skmsg: get rid of unnecessary memset()
  2022-04-10 16:10 [Patch bpf-next v1 0/4] sockmap: some performance optimizations Cong Wang
                   ` (2 preceding siblings ...)
  2022-04-10 16:10 ` [Patch bpf-next v1 3/4] skmsg: get rid of skb_clone() Cong Wang
@ 2022-04-10 16:10 ` Cong Wang
  2022-04-26  9:27 ` [Patch bpf-next v1 0/4] sockmap: some performance optimizations Jakub Sitnicki
  4 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-10 16:10 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, John Fastabend, Daniel Borkmann, Jakub Sitnicki

From: Cong Wang <cong.wang@bytedance.com>

We always allocate skmsg with kzalloc(), so there is no need
to call memset(0) on it; the only thing we need from
sk_msg_init() is sg_init_marker(). So introduce a new helper
which is just kzalloc() + sg_init_marker(); this saves an
unnecessary memset(0) for skmsg on the fast path.
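
For reference, sk_msg_init() (include/linux/skmsg.h) is roughly:

	static inline void sk_msg_init(struct sk_msg *msg)
	{
		BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS);
		memset(msg, 0, sizeof(*msg));
		sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
	}

so with kzalloc() already zeroing the allocation, only the
sg_init_marker() call carries any real work.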

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 net/core/skmsg.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 7aa37b6287e1..d165d81c1e4a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -497,23 +497,27 @@ bool sk_msg_is_readable(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_msg_is_readable);
 
-static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
-						  struct sk_buff *skb)
+static struct sk_msg *alloc_sk_msg(void)
 {
 	struct sk_msg *msg;
 
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
+	if (unlikely(!msg))
 		return NULL;
+	sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
+	return msg;
+}
 
-	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+						  struct sk_buff *skb)
+{
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		return NULL;
 
-	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
-	if (unlikely(!msg))
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 		return NULL;
 
-	sk_msg_init(msg);
-	return msg;
+	return alloc_sk_msg();
 }
 
 static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
@@ -586,13 +590,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
 				     u32 off, u32 len)
 {
-	struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	struct sk_msg *msg = alloc_sk_msg();
 	struct sock *sk = psock->sk;
 	int err;
 
 	if (unlikely(!msg))
 		return -EAGAIN;
-	sk_msg_init(msg);
 	skb_set_owner_r(skb, sk);
 	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
 	if (err < 0)
-- 
2.32.0



* RE: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
@ 2022-04-12 20:00   ` John Fastabend
  2022-04-21 19:00     ` Cong Wang
  2022-04-25 19:07   ` Jakub Kicinski
  2022-04-26  9:11   ` Jakub Sitnicki
  2 siblings, 1 reply; 16+ messages in thread
From: John Fastabend @ 2022-04-12 20:00 UTC (permalink / raw)
  To: Cong Wang, netdev
  Cc: Cong Wang, Eric Dumazet, John Fastabend, Daniel Borkmann, Jakub Sitnicki

Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> This patch introduces tcp_read_skb() based on tcp_read_sock(),
> in preparation for the next patch, which introduces a new
> proto_ops ->read_skb().
>
> TCP is special here because it has tcp_read_sock(), which is
> mainly used by splice(). tcp_read_sock() supports partial reads
> and arbitrary offsets, neither of which is needed by sockmap.
> 
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---

Thanks for doing this Cong, comment/question inline.

[...]

> +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> +		 sk_read_actor_t recv_actor)
> +{
> +	struct sk_buff *skb;
> +	struct tcp_sock *tp = tcp_sk(sk);
> +	u32 seq = tp->copied_seq;
> +	u32 offset;
> +	int copied = 0;
> +
> +	if (sk->sk_state == TCP_LISTEN)
> +		return -ENOTCONN;
> +	while ((skb = tcp_recv_skb(sk, seq, &offset, true)) != NULL) {

I'm trying to see why we might have an offset here if we always
consume the entire skb. There is a comment in tcp_recv_skb around
GRO packets, but it's not yet clear to me how this applies here,
if it does at all. Will read a bit more I guess.

If the offset can be >0 then we also need to fix the recv_actor to
account for the extra offset in the skb. As is, the bpf prog might
see duplicate data. This is a problem in the stream parser now.

Another fallout is that if the offset is zero we could just do
a skb_dequeue here and skip the tcp_recv_skb bool flag addition
and update.

I'll continue reading after a few other things I need to get
sorted this afternoon, but maybe you have the answer on hand.

> +		if (offset < skb->len) {
> +			int used;
> +			size_t len;
> +
> +			len = skb->len - offset;
> +			used = recv_actor(desc, skb, offset, len);
> +			if (used <= 0) {
> +				if (!copied)
> +					copied = used;
> +				break;
> +			}
> +			if (WARN_ON_ONCE(used > len))
> +				used = len;
> +			seq += used;
> +			copied += used;
> +			offset += used;
> +
> +			if (offset != skb->len)
> +				continue;
> +		}
> +		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
> +			kfree_skb(skb);
> +			++seq;
> +			break;
> +		}
> +		kfree_skb(skb);
> +		if (!desc->count)
> +			break;
> +		WRITE_ONCE(tp->copied_seq, seq);
> +	}
> +	WRITE_ONCE(tp->copied_seq, seq);
> +
> +	tcp_rcv_space_adjust(sk);
> +
> +	/* Clean up data we have read: This will do ACK frames. */
> +	if (copied > 0)
> +		tcp_cleanup_rbuf(sk, copied);
> +
> +	return copied;
> +}
> +EXPORT_SYMBOL(tcp_read_skb);
> +


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-12 20:00   ` John Fastabend
@ 2022-04-21 19:00     ` Cong Wang
  2022-04-26  6:27       ` John Fastabend
  0 siblings, 1 reply; 16+ messages in thread
From: Cong Wang @ 2022-04-21 19:00 UTC (permalink / raw)
  To: John Fastabend
  Cc: Linux Kernel Network Developers, Cong Wang, Eric Dumazet,
	Daniel Borkmann, Jakub Sitnicki

On Tue, Apr 12, 2022 at 1:00 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > This patch introduces tcp_read_skb() based on tcp_read_sock(),
> > in preparation for the next patch, which introduces a new
> > proto_ops ->read_skb().
> >
> > TCP is special here because it has tcp_read_sock(), which is
> > mainly used by splice(). tcp_read_sock() supports partial reads
> > and arbitrary offsets, neither of which is needed by sockmap.
> >
> > Cc: Eric Dumazet <edumazet@google.com>
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
>
> Thanks for doing this Cong, comment/question inline.
>
> [...]
>
> > +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> > +              sk_read_actor_t recv_actor)
> > +{
> > +     struct sk_buff *skb;
> > +     struct tcp_sock *tp = tcp_sk(sk);
> > +     u32 seq = tp->copied_seq;
> > +     u32 offset;
> > +     int copied = 0;
> > +
> > +     if (sk->sk_state == TCP_LISTEN)
> > +             return -ENOTCONN;
> > +     while ((skb = tcp_recv_skb(sk, seq, &offset, true)) != NULL) {
>
> I'm trying to see why we might have an offset here if we always
> consume the entire skb. There is a comment in tcp_recv_skb around
> GRO packets, but it's not yet clear to me how this applies here,
> if it does at all. Will read a bit more I guess.
>
> If the offset can be >0 then we also need to fix the recv_actor to
> account for the extra offset in the skb. As is, the bpf prog might
> see duplicate data. This is a problem in the stream parser now.
>
> Another fallout is that if the offset is zero we could just do
> a skb_dequeue here and skip the tcp_recv_skb bool flag addition
> and update.

I think it is mainly for splice(), and of course strparser, but
neither of them is touched by my patchset.

>
> I'll continue reading after a few other things I need to get
> sorted this afternoon, but maybe you have the answer on hand.
>

Please let me know if you have any other comments.

Thanks.


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
  2022-04-12 20:00   ` John Fastabend
@ 2022-04-25 19:07   ` Jakub Kicinski
  2022-04-30 17:22     ` Cong Wang
  2022-04-26  9:11   ` Jakub Sitnicki
  2 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2022-04-25 19:07 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, Cong Wang, Eric Dumazet, John Fastabend, Daniel Borkmann,
	Jakub Sitnicki

On Sun, 10 Apr 2022 09:10:39 -0700 Cong Wang wrote:
> +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> +		 sk_read_actor_t recv_actor)
> +{
> +	struct sk_buff *skb;
> +	struct tcp_sock *tp = tcp_sk(sk);
> +	u32 seq = tp->copied_seq;
> +	u32 offset;
> +	int copied = 0;
> +
> +	if (sk->sk_state == TCP_LISTEN)
> +		return -ENOTCONN;
> +	while ((skb = tcp_recv_skb(sk, seq, &offset, true)) != NULL) {
> +		if (offset < skb->len) {
> +			int used;
> +			size_t len;
> +
> +			len = skb->len - offset;
> +			used = recv_actor(desc, skb, offset, len);
> +			if (used <= 0) {
> +				if (!copied)
> +					copied = used;
> +				break;
> +			}
> +			if (WARN_ON_ONCE(used > len))
> +				used = len;
> +			seq += used;
> +			copied += used;
> +			offset += used;
> +
> +			if (offset != skb->len)
> +				continue;
> +		}
> +		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
> +			kfree_skb(skb);
> +			++seq;
> +			break;
> +		}
> +		kfree_skb(skb);
> +		if (!desc->count)
> +			break;
> +		WRITE_ONCE(tp->copied_seq, seq);
> +	}
> +	WRITE_ONCE(tp->copied_seq, seq);
> +
> +	tcp_rcv_space_adjust(sk);
> +
> +	/* Clean up data we have read: This will do ACK frames. */
> +	if (copied > 0)
> +		tcp_cleanup_rbuf(sk, copied);
> +
> +	return copied;
> +}
> +EXPORT_SYMBOL(tcp_read_skb);

I started prototyping a similar patch for TLS a while back, but I have
two functions - one to get the skb and another to consume it. I thought
that's better for TLS, otherwise skbs stuck in the middle layer are not
counted towards the rbuf. Any thoughts on structuring the API that way?
I guess we can refactor that later; since TLS is TCP-only, we don't need
proto_ops plumbing there.
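
A rough sketch of that two-call shape (hypothetical signatures, not
actual TLS code):

	/* Peek at the next skb without consuming it, so it still
	 * counts against the receive buffer while the upper layer
	 * holds on to it.
	 */
	struct sk_buff *tcp_read_skb_peek(struct sock *sk, u32 *offset);

	/* Tell TCP the data has been consumed: advance copied_seq
	 * and clean up the receive buffer / send ACKs.
	 */
	void tcp_read_skb_done(struct sock *sk, u32 consumed);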

Overall 👍 for adding such an API.


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-21 19:00     ` Cong Wang
@ 2022-04-26  6:27       ` John Fastabend
  2022-04-30 17:17         ` Cong Wang
  0 siblings, 1 reply; 16+ messages in thread
From: John Fastabend @ 2022-04-26  6:27 UTC (permalink / raw)
  To: Cong Wang, John Fastabend
  Cc: Linux Kernel Network Developers, Cong Wang, Eric Dumazet,
	Daniel Borkmann, Jakub Sitnicki

Cong Wang wrote:
> On Tue, Apr 12, 2022 at 1:00 PM John Fastabend <john.fastabend@gmail.com> wrote:
> >
> > Cong Wang wrote:
> > > From: Cong Wang <cong.wang@bytedance.com>
> > >
> > > This patch introduces tcp_read_skb() based on tcp_read_sock(),
> > > in preparation for the next patch, which introduces a new
> > > proto_ops ->read_skb().
> > >
> > > TCP is special here because it has tcp_read_sock(), which is
> > > mainly used by splice(). tcp_read_sock() supports partial reads
> > > and arbitrary offsets, neither of which is needed by sockmap.
> > >
> > > Cc: Eric Dumazet <edumazet@google.com>
> > > Cc: John Fastabend <john.fastabend@gmail.com>
> > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > ---
> >
> > Thanks for doing this Cong, comment/question inline.
> >
> > [...]
> >
> > > +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> > > +              sk_read_actor_t recv_actor)
> > > +{
> > > +     struct sk_buff *skb;
> > > +     struct tcp_sock *tp = tcp_sk(sk);
> > > +     u32 seq = tp->copied_seq;
> > > +     u32 offset;
> > > +     int copied = 0;
> > > +
> > > +     if (sk->sk_state == TCP_LISTEN)
> > > +             return -ENOTCONN;
> > > +     while ((skb = tcp_recv_skb(sk, seq, &offset, true)) != NULL) {
> >
> > I'm trying to see why we might have an offset here if we always
> > consume the entire skb. There is a comment in tcp_recv_skb around
> > GRO packets, but it's not yet clear to me how this applies here,
> > if it does at all. Will read a bit more I guess.
> >
> > If the offset can be >0 then we also need to fix the recv_actor to
> > account for the extra offset in the skb. As is, the bpf prog might
> > see duplicate data. This is a problem in the stream parser now.
> >
> > Another fallout is that if the offset is zero we could just do
> > a skb_dequeue here and skip the tcp_recv_skb bool flag addition
> > and update.
> 
> I think it is mainly for splice(), and of course strparser, but
> neither of them is touched by my patchset.
> 
> >
> > I'll continue reading after a few other things I need to get
> > sorted this afternoon, but maybe you have the answer on hand.
> >
> 
> Please let me know if you have any other comments.

For now, no more comments. If it's not used then we can drop the offset
logic in this patch and the code will look much simpler.

> 
> Thanks.




* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
  2022-04-12 20:00   ` John Fastabend
  2022-04-25 19:07   ` Jakub Kicinski
@ 2022-04-26  9:11   ` Jakub Sitnicki
  2022-04-30 17:18     ` Cong Wang
  2 siblings, 1 reply; 16+ messages in thread
From: Jakub Sitnicki @ 2022-04-26  9:11 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, Cong Wang, Eric Dumazet, John Fastabend, Daniel Borkmann

On Sun, Apr 10, 2022 at 09:10 AM -07, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> This patch introduces tcp_read_skb() based on tcp_read_sock(),
> in preparation for the next patch, which introduces a new
> proto_ops ->read_skb().
>
> TCP is special here because it has tcp_read_sock(), which is
> mainly used by splice(). tcp_read_sock() supports partial reads
> and arbitrary offsets, neither of which is needed by sockmap.
>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>  include/net/tcp.h |  2 ++
>  net/ipv4/tcp.c    | 72 +++++++++++++++++++++++++++++++++++++++++------
>  2 files changed, 66 insertions(+), 8 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 6d50a662bf89..f0d4ce6855e1 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -667,6 +667,8 @@ void tcp_get_info(struct sock *, struct tcp_info *);
>  /* Read 'sendfile()'-style from a TCP socket */
>  int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
>  		  sk_read_actor_t recv_actor);
> +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> +		 sk_read_actor_t recv_actor);

Do you think it would be worth adding docs for the newly added function?
Why it exists and how it differs from tcp_read_sock(), which has the
same interface?

[...]


* Re: [Patch bpf-next v1 0/4] sockmap: some performance optimizations
  2022-04-10 16:10 [Patch bpf-next v1 0/4] sockmap: some performance optimizations Cong Wang
                   ` (3 preceding siblings ...)
  2022-04-10 16:10 ` [Patch bpf-next v1 4/4] skmsg: get rid of unnecessary memset() Cong Wang
@ 2022-04-26  9:27 ` Jakub Sitnicki
  2022-04-30 17:27   ` Cong Wang
  4 siblings, 1 reply; 16+ messages in thread
From: Jakub Sitnicki @ 2022-04-26  9:27 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, Cong Wang

On Sun, Apr 10, 2022 at 09:10 AM -07, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> This patchset contains two optimizations for sockmap. The first one
> eliminates an skb_clone() and the second one eliminates a memset().
> After this patchset, the throughput of UDP transmission via sockmap
> improves by 61%.

That's a pretty exact number ;-)

Is this a measurement from metrics collected from a production
environment, or were you using a synthetic benchmark?

If it was the latter, would you be able to share the tooling?

I'm looking at extending the fio net engine with sockmap support, so
that we can have a reference benchmark. It would be helpful to see which
sockmap setup scenarios are worth focusing on.

Thanks,
Jakub


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-26  6:27       ` John Fastabend
@ 2022-04-30 17:17         ` Cong Wang
  0 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-30 17:17 UTC (permalink / raw)
  To: John Fastabend
  Cc: Linux Kernel Network Developers, Cong Wang, Eric Dumazet,
	Daniel Borkmann, Jakub Sitnicki

On Mon, Apr 25, 2022 at 11:27 PM John Fastabend
<john.fastabend@gmail.com> wrote:
>
> For now, no more comments. If it's not used then we can drop the offset
> logic in this patch and the code will look much simpler.

Good point, it is left there mainly for tcp_recv_skb(), but it can be dropped
for the rest.

Thanks.


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-26  9:11   ` Jakub Sitnicki
@ 2022-04-30 17:18     ` Cong Wang
  0 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-30 17:18 UTC (permalink / raw)
  To: Jakub Sitnicki
  Cc: Linux Kernel Network Developers, Cong Wang, Eric Dumazet,
	John Fastabend, Daniel Borkmann

On Tue, Apr 26, 2022 at 2:12 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Sun, Apr 10, 2022 at 09:10 AM -07, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > This patch introduces tcp_read_skb() based on tcp_read_sock(),
> > in preparation for the next patch, which introduces a new
> > proto_ops ->read_skb().
> >
> > TCP is special here because it has tcp_read_sock(), which is
> > mainly used by splice(). tcp_read_sock() supports partial reads
> > and arbitrary offsets, neither of which is needed by sockmap.
> >
> > Cc: Eric Dumazet <edumazet@google.com>
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> >  include/net/tcp.h |  2 ++
> >  net/ipv4/tcp.c    | 72 +++++++++++++++++++++++++++++++++++++++++------
> >  2 files changed, 66 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 6d50a662bf89..f0d4ce6855e1 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -667,6 +667,8 @@ void tcp_get_info(struct sock *, struct tcp_info *);
> >  /* Read 'sendfile()'-style from a TCP socket */
> >  int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
> >                 sk_read_actor_t recv_actor);
> > +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> > +              sk_read_actor_t recv_actor);
>
> Do you think it would be worth adding docs for the newly added function?
> Why it exists and how it differs from tcp_read_sock(), which has the
> same interface?

Yeah, I will add some comments to explain this in V2.

Thanks.


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-25 19:07   ` Jakub Kicinski
@ 2022-04-30 17:22     ` Cong Wang
  2022-05-02 16:13       ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Cong Wang @ 2022-04-30 17:22 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Linux Kernel Network Developers, Cong Wang, Eric Dumazet,
	John Fastabend, Daniel Borkmann, Jakub Sitnicki

On Mon, Apr 25, 2022 at 12:07 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Sun, 10 Apr 2022 09:10:39 -0700 Cong Wang wrote:
> > +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
> > +              sk_read_actor_t recv_actor)
> > +{
> > +     struct sk_buff *skb;
> > +     struct tcp_sock *tp = tcp_sk(sk);
> > +     u32 seq = tp->copied_seq;
> > +     u32 offset;
> > +     int copied = 0;
> > +
> > +     if (sk->sk_state == TCP_LISTEN)
> > +             return -ENOTCONN;
> > +     while ((skb = tcp_recv_skb(sk, seq, &offset, true)) != NULL) {
> > +             if (offset < skb->len) {
> > +                     int used;
> > +                     size_t len;
> > +
> > +                     len = skb->len - offset;
> > +                     used = recv_actor(desc, skb, offset, len);
> > +                     if (used <= 0) {
> > +                             if (!copied)
> > +                                     copied = used;
> > +                             break;
> > +                     }
> > +                     if (WARN_ON_ONCE(used > len))
> > +                             used = len;
> > +                     seq += used;
> > +                     copied += used;
> > +                     offset += used;
> > +
> > +                     if (offset != skb->len)
> > +                             continue;
> > +             }
> > +             if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
> > +                     kfree_skb(skb);
> > +                     ++seq;
> > +                     break;
> > +             }
> > +             kfree_skb(skb);
> > +             if (!desc->count)
> > +                     break;
> > +             WRITE_ONCE(tp->copied_seq, seq);
> > +     }
> > +     WRITE_ONCE(tp->copied_seq, seq);
> > +
> > +     tcp_rcv_space_adjust(sk);
> > +
> > +     /* Clean up data we have read: This will do ACK frames. */
> > +     if (copied > 0)
> > +             tcp_cleanup_rbuf(sk, copied);
> > +
> > +     return copied;
> > +}
> > +EXPORT_SYMBOL(tcp_read_skb);
>
> I started prototyping a similar patch for TLS a while back, but I have
> two functions - one to get the skb and another to consume it. I thought
> that's better for TLS, otherwise skbs stuck in the middle layer are not
> counted towards the rbuf. Any thoughts on structuring the API that way?
> I guess we can refactor that later; since TLS is TCP-only, we don't need
> proto_ops plumbing there.

Do you have a pointer to the source code? I am not sure how TLS uses
->read_sock() (or which interface is relevant).

>
> Overall 👍 for adding such an API.

Thanks.


* Re: [Patch bpf-next v1 0/4] sockmap: some performance optimizations
  2022-04-26  9:27 ` [Patch bpf-next v1 0/4] sockmap: some performance optimizations Jakub Sitnicki
@ 2022-04-30 17:27   ` Cong Wang
  0 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2022-04-30 17:27 UTC (permalink / raw)
  To: Jakub Sitnicki; +Cc: Linux Kernel Network Developers, Cong Wang

On Tue, Apr 26, 2022 at 2:33 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Sun, Apr 10, 2022 at 09:10 AM -07, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > This patchset contains two optimizations for sockmap. The first one
> > eliminates an skb_clone() and the second one eliminates a memset().
> > After this patchset, the throughput of UDP transmission via sockmap
> > improves by 61%.
>
> That's a pretty exact number ;-)
>
> Is this a measurement from metrics collected from a production
> enviroment, or were you using a synthetic benchmark?
>
> If it was the latter, would you be able to share the tooling?

Sure, actually my colleague Jiang modified Cloudflare's TCP
sockmap code for UDP; here is the link:
https://github.com/Jiang1155/cloudflare-blog/tree/skmap-udp

>
> I'm looking at extending the fio net engine with sockmap support, so
> that we can have a reference benchmark. It would be helpful to see which
> sockmap setup scenarios are worth focusing on.
>

Sounds like a good idea.

Thanks.


* Re: [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb()
  2022-04-30 17:22     ` Cong Wang
@ 2022-05-02 16:13       ` Jakub Kicinski
  0 siblings, 0 replies; 16+ messages in thread
From: Jakub Kicinski @ 2022-05-02 16:13 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, Cong Wang, Eric Dumazet,
	John Fastabend, Daniel Borkmann, Jakub Sitnicki

On Sat, 30 Apr 2022 10:22:33 -0700 Cong Wang wrote:
> > I started prototyping a similar patch for TLS a while back, but I have
> > two functions - one to get the skb and another to consume it. I thought
> > that's better for TLS, otherwise skbs stuck in the middle layer are not
> > counted towards the rbuf. Any thoughts on structuring the API that way?
> > I guess we can refactor that later; since TLS is TCP-only, we don't need
> > proto_ops plumbing there.
> 
> Do you have a pointer to the source code? I am not sure how TLS uses
> ->read_sock() (or which interface is relevant).  

Nothing useful; I started hacking on removing strparser but then got
distracted with functional optimizations. TLS calls ->read_sock()
through strparser.

With a little bit of code duplication TLS should be able to avoid
the strparser's heavy machinery and cloning each skb.


Thread overview: 16 messages
2022-04-10 16:10 [Patch bpf-next v1 0/4] sockmap: some performance optimizations Cong Wang
2022-04-10 16:10 ` [Patch bpf-next v1 1/4] tcp: introduce tcp_read_skb() Cong Wang
2022-04-12 20:00   ` John Fastabend
2022-04-21 19:00     ` Cong Wang
2022-04-26  6:27       ` John Fastabend
2022-04-30 17:17         ` Cong Wang
2022-04-25 19:07   ` Jakub Kicinski
2022-04-30 17:22     ` Cong Wang
2022-05-02 16:13       ` Jakub Kicinski
2022-04-26  9:11   ` Jakub Sitnicki
2022-04-30 17:18     ` Cong Wang
2022-04-10 16:10 ` [Patch bpf-next v1 2/4] net: introduce a new proto_ops ->read_skb() Cong Wang
2022-04-10 16:10 ` [Patch bpf-next v1 3/4] skmsg: get rid of skb_clone() Cong Wang
2022-04-10 16:10 ` [Patch bpf-next v1 4/4] skmsg: get rid of unnecessary memset() Cong Wang
2022-04-26  9:27 ` [Patch bpf-next v1 0/4] sockmap: some performance optimizations Jakub Sitnicki
2022-04-30 17:27   ` Cong Wang
