All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC 0/2] optimise UDP skb completion wakeups
@ 2024-02-07 14:23 Pavel Begunkov
  2024-02-07 14:23 ` [RFC 1/2] udp: introduce udp specific skb destructor Pavel Begunkov
  2024-02-07 14:23 ` [RFC 2/2] udp: optimise write wakeups with SOCK_NOSPACE Pavel Begunkov
  0 siblings, 2 replies; 3+ messages in thread
From: Pavel Begunkov @ 2024-02-07 14:23 UTC (permalink / raw)
  To: netdev, edumazet, davem, dsahern, pabeni, kuba; +Cc: Pavel Begunkov

sock_wfree() tries to wake even when there are only read waiters and no
write waiters, which is the common scenario for io_uring. It'd also
attempt to wake if the write queue is far from being full. To avoid most
of this overhead add SOCK_NOSPACE support for UDP.

It brings +5% to UDP throughput with a CPU bound benchmark, and I observed
it taking 0.5-2% according to profiles in more realistic workloads.

Patch 1 introduces a new destructor udp_wfree(). The optimisation could be
implemented in sock_wfree() instead, but that would either require patching up
all poll callbacks across the tree that are used in tandem with sock_wfree(),
or limiting it to UDP with a flag.

Another option considered was to split out a write waitqueue, perhaps
conditionally, but it's bulkier, and the current version should also
benefit epoll workloads.

Pavel Begunkov (2):
  udp: introduce udp specific skb destructor
  udp: optimise write wakeups with SOCK_NOSPACE

 drivers/net/veth.c | 10 +++++---
 include/net/sock.h |  1 +
 include/net/udp.h  |  1 +
 net/core/sock.c    | 10 +++++---
 net/ipv4/udp.c     | 58 ++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/udp.c     |  5 +++-
 6 files changed, 76 insertions(+), 9 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [RFC 1/2] udp: introduce udp specific skb destructor
  2024-02-07 14:23 [RFC 0/2] optimise UDP skb completion wakeups Pavel Begunkov
@ 2024-02-07 14:23 ` Pavel Begunkov
  2024-02-07 14:23 ` [RFC 2/2] udp: optimise write wakeups with SOCK_NOSPACE Pavel Begunkov
  1 sibling, 0 replies; 3+ messages in thread
From: Pavel Begunkov @ 2024-02-07 14:23 UTC (permalink / raw)
  To: netdev, edumazet, davem, dsahern, pabeni, kuba; +Cc: Pavel Begunkov

Introduce a UDP specific skb destructor, udp_wfree(). We'll need it in
the next patch, which wires up SOCK_NOSPACE support for UDP sockets. We
can't reuse sock_wfree() there as is because SOCK_NOSPACE also requires
support from the poll callback.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 drivers/net/veth.c | 10 +++++++---
 include/net/sock.h |  1 +
 include/net/udp.h  |  1 +
 net/core/sock.c    | 10 +++++++---
 net/ipv4/udp.c     | 41 ++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/udp.c     |  5 ++++-
 6 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 578e36ea1589..84ed58b957e5 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,6 +19,7 @@
 #include <net/dst.h>
 #include <net/xfrm.h>
 #include <net/xdp.h>
+#include <net/udp.h>
 #include <linux/veth.h>
 #include <linux/module.h>
 #include <linux/bpf.h>
@@ -335,9 +336,12 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
 					 const struct net_device *rcv,
 					 const struct sk_buff *skb)
 {
-	return !(dev->features & NETIF_F_ALL_TSO) ||
-		(skb->destructor == sock_wfree &&
-		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
+	if (skb->destructor == sock_wfree ||
+	    (IS_ENABLED(CONFIG_INET) && skb->destructor == udp_wfree))
+		return rcv->features & (NETIF_F_GRO_FRAGLIST |
+					NETIF_F_GRO_UDP_FWD);
+
+	return !(dev->features & NETIF_F_ALL_TSO);
 }
 
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
diff --git a/include/net/sock.h b/include/net/sock.h
index a7f815c7cfdf..bd58901ff87a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1811,6 +1811,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		      struct proto *prot, int kern);
 void sk_free(struct sock *sk);
+void __sk_free(struct sock *sk);
 void sk_destruct(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
 void sk_free_unlock_clone(struct sock *sk);
diff --git a/include/net/udp.h b/include/net/udp.h
index 488a6d2babcc..bdb3dabf789d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -288,6 +288,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
 __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
+void udp_wfree(struct sk_buff *skb);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 				       netdev_features_t features,
 				       bool is_ipv6);
diff --git a/net/core/sock.c b/net/core/sock.c
index 158dbdebce6a..6172686635a8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -140,6 +140,7 @@
 #include <trace/events/sock.h>
 
 #include <net/tcp.h>
+#include <net/udp.h>
 #include <net/busy_poll.h>
 #include <net/phonet/phonet.h>
 
@@ -2221,7 +2222,7 @@ void sk_destruct(struct sock *sk)
 		__sk_destruct(&sk->sk_rcu);
 }
 
-static void __sk_free(struct sock *sk)
+void __sk_free(struct sock *sk)
 {
 	if (likely(sk->sk_net_refcnt))
 		sock_inuse_add(sock_net(sk), -1);
@@ -2231,6 +2232,7 @@ static void __sk_free(struct sock *sk)
 	else
 		sk_destruct(sk);
 }
+EXPORT_SYMBOL(__sk_free);
 
 void sk_free(struct sock *sk)
 {
@@ -2531,8 +2533,10 @@ static bool can_skb_orphan_partial(const struct sk_buff *skb)
 	if (skb->decrypted)
 		return false;
 #endif
-	return (skb->destructor == sock_wfree ||
-		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
+	if (skb->destructor == sock_wfree)
+		return true;
+	return IS_ENABLED(CONFIG_INET) &&
+		(skb->destructor == tcp_wfree || skb->destructor == udp_wfree);
 }
 
 /* This helper is used by netem, as it can hold packets in its
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 148ffb007969..90ff77ab78f9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -796,6 +796,42 @@ int udp_err(struct sk_buff *skb, u32 info)
 	return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
 }
 
+static inline bool __udp_wfree(struct sk_buff *skb)
+{
+	struct socket_wq *wq;
+	struct sock *sk = skb->sk;
+	bool free;
+
+	free = refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc);
+	/* a full barrier is required before waitqueue_active() */
+	smp_mb__after_atomic();
+
+	if (!sock_writeable(sk))
+		goto out;
+
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq && waitqueue_active(&wq->wait))
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+						EPOLLWRNORM | EPOLLWRBAND);
+	/* Should agree with poll, otherwise some programs break */
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+out:
+	return free;
+}
+
+void udp_wfree(struct sk_buff *skb)
+{
+	bool free;
+
+	rcu_read_lock();
+	free = __udp_wfree(skb);
+	rcu_read_unlock();
+
+	if (unlikely(free))
+		__sk_free(skb->sk);
+}
+EXPORT_SYMBOL_GPL(udp_wfree);
+
 /*
  * Throw away all pending data and cancel the corking. Socket is locked.
  */
@@ -989,6 +1025,7 @@ int udp_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
+	skb->destructor = udp_wfree;
 	err = udp_send_skb(skb, fl4, &inet->cork.base);
 
 out:
@@ -1246,8 +1283,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 				  sizeof(struct udphdr), &ipc, &rt,
 				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
-		if (!IS_ERR_OR_NULL(skb))
+		if (!IS_ERR_OR_NULL(skb)) {
+			skb->destructor = udp_wfree;
 			err = udp_send_skb(skb, fl4, &cork);
+		}
 		goto out;
 	}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3f2249b4cd5f..d0c74a8f0914 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1309,6 +1309,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
+	skb->destructor = udp_wfree;
 	err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
 			      &inet_sk(sk)->cork.base);
 out:
@@ -1576,8 +1577,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 				   (struct rt6_info *)dst,
 				   msg->msg_flags, &cork);
 		err = PTR_ERR(skb);
-		if (!IS_ERR_OR_NULL(skb))
+		if (!IS_ERR_OR_NULL(skb)) {
+			skb->destructor = udp_wfree;
 			err = udp_v6_send_skb(skb, fl6, &cork.base);
+		}
 		/* ip6_make_skb steals dst reference */
 		goto out_no_dst;
 	}
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [RFC 2/2] udp: optimise write wakeups with SOCK_NOSPACE
  2024-02-07 14:23 [RFC 0/2] optimise UDP skb completion wakeups Pavel Begunkov
  2024-02-07 14:23 ` [RFC 1/2] udp: introduce udp specific skb destructor Pavel Begunkov
@ 2024-02-07 14:23 ` Pavel Begunkov
  1 sibling, 0 replies; 3+ messages in thread
From: Pavel Begunkov @ 2024-02-07 14:23 UTC (permalink / raw)
  To: netdev, edumazet, davem, dsahern, pabeni, kuba; +Cc: Pavel Begunkov

Often the write queue is never filled to capacity; however, every sent skb
would still unnecessarily try to wake the pollers via sock_wfree(). That
holds even if we don't have any write/POLLOUT pollers, as the socket's
waitqueue is shared between readers and writers.

Optimise it with SOCK_NOSPACE, which avoids waking up unless there are
waiters that were starved of space. With a dummy device io_uring
benchmark pushing as much as it can, I've got +5% to CPU bound
throughput (2268 Krps -> 2380). Profiles showed ~3-4% total reduction
due to the change in the CPU share of destructors.

As noted in the previous patch, the newly introduced udp_wfree() is not
based on sock_wfree() because SOCK_NOSPACE requires support from the
poll callback, and there are a number of custom poll callbacks in the
tree that are paired with sock_wfree().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 net/ipv4/udp.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 90ff77ab78f9..cacfbee71437 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -803,9 +803,13 @@ static inline bool __udp_wfree(struct sk_buff *skb)
 	bool free;
 
 	free = refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc);
-	/* a full barrier is required before waitqueue_active() */
+	/* a full barrier is required before waitqueue_active() and the
+	 * SOCK_NOSPACE test below.
+	 */
 	smp_mb__after_atomic();
 
+	if (sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		goto out;
 	if (!sock_writeable(sk))
 		goto out;
 
@@ -2925,8 +2929,19 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	/* psock ingress_msg queue should not contain any bad checksum frames */
 	if (sk_is_readable(sk))
 		mask |= EPOLLIN | EPOLLRDNORM;
-	return mask;
 
+	if (!sock_writeable(sk)) {
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		/* Order with the wspace read so either we observe it
+		 * writeable or udp_sock_wfree() would find SOCK_NOSPACE and
+		 * wake us up.
+		 */
+		smp_mb__after_atomic();
+
+		if (sock_writeable(sk))
+			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
+	}
+	return mask;
 }
 EXPORT_SYMBOL(udp_poll);
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-02-07 14:25 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-07 14:23 [RFC 0/2] optimise UDP skb completion wakeups Pavel Begunkov
2024-02-07 14:23 ` [RFC 1/2] udp: introduce udp specific skb destructor Pavel Begunkov
2024-02-07 14:23 ` [RFC 2/2] udp: optimise write wakeups with SOCK_NOSPACE Pavel Begunkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.