* [PATCH net-next] udp: do fwd memory scheduling on dequeue
@ 2016-10-28 13:20 ` Paolo Abeni
  0 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2016-10-28 13:20 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, James Morris, Trond Myklebust, Alexander Duyck,
	Daniel Borkmann, Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs

A new argument is added to __skb_recv_datagram; it allows
the caller to perform a protocol-specific action on dequeue
under the receive queue spinlock.
The UDP protocol uses this argument to perform fwd memory
reclaiming on dequeue, while the protocol memory and rmem updates
are delayed until after the lock is released, to keep the time spent
under the lock as low as possible.
The UDP-specific skb destructor is not used anymore; instead,
explicit memory reclaiming is performed at close() time and
when skbs are removed from the receive queue.
In-kernel UDP protocol users now need to use the
skb_recv_udp() variant instead of skb_recv_datagram() to
properly perform memory accounting on dequeue.
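
The conversion of in-kernel callers is mechanical, e.g. in
net/sunrpc/xprtsock.c below:

-		skb = skb_recv_datagram(sk, 0, 1, &err);
+		skb = skb_recv_udp(sk, 0, 1, &err);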

Overall, this allows acquiring the receive queue lock only
once per dequeue.
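
Schematically, the dequeue path becomes (MSG_PEEK skips both
accounting steps):

	spin_lock_irqsave(&queue->lock, cpu_flags);
	__skb_unlink(skb, queue);
	udp_dequeue(sk, skb, flags);        /* fwd memory reclaim, under the lock */
	spin_unlock_irqrestore(&queue->lock, cpu_flags);
	__udp_rmem_release(sk, skb, flags); /* rmem and protocol memory updates */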

Tested using pktgen with random src ports and 64 byte packets,
sending at wire speed on a 10G link, with udp_sink as the receiver,
using an L4-tuple rxhash to stress the contention, and one or more
udp_sink instances with reuseport.

nr sinks	vanilla		patched
1		440		560
3		2150		2300
6		3650		3800
9		4450		4600
12		6250		6450

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h |  4 ++++
 include/net/udp.h      | 10 ++++++++
 net/core/datagram.c    | 11 ++++++---
 net/ipv4/udp.c         | 65 ++++++++++++++++++++++++++++++++++++++++----------
 net/ipv6/udp.c         |  3 +--
 net/rxrpc/input.c      |  7 +++---
 net/sunrpc/svcsock.c   |  2 +-
 net/sunrpc/xprtsock.c  |  2 +-
 net/unix/af_unix.c     |  4 ++--
 9 files changed, 83 insertions(+), 25 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 601258f..dd171a9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3028,13 +3028,17 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 #define skb_walk_frags(skb, iter)	\
 	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
 
+typedef void (*skb_dequeue_cb_t)(struct sock *sk, struct sk_buff *skb,
+				 int flags);
 
 int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 				const struct sk_buff *skb);
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
+					skb_dequeue_cb_t dequeue_cb,
 					int *peeked, int *off, int *err,
 					struct sk_buff **last);
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
+				    skb_dequeue_cb_t dequeue_cb,
 				    int *peeked, int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
diff --git a/include/net/udp.h b/include/net/udp.h
index 18f1e6b..983c861 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -48,6 +48,7 @@ struct udp_skb_cb {
 	} header;
 	__u16		cscov;
 	__u8		partial_cov;
+	int		fwd_memory_released;
 };
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
@@ -248,6 +249,15 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 /* net/ipv4/udp.c */
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock,
+			       int *peeked, int *off, int *err);
+static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
+					   int noblock, int *err)
+{
+	int peeked, off = 0;
+
+	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
+}
 
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index bfb973a..226548b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -165,6 +165,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
  *	__skb_try_recv_datagram - Receive a datagram skbuff
  *	@sk: socket
  *	@flags: MSG_ flags
+ *	@dequeue_cb: invoked under the receive lock on successful dequeue
  *	@peeked: returns non-zero if this packet has been seen before
  *	@off: an offset in bytes to peek skb from. Returns an offset
  *	      within an skb where data actually starts
@@ -197,6 +198,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
  *	the standard around please.
  */
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+					skb_dequeue_cb_t dequeue_cb,
 					int *peeked, int *off, int *err,
 					struct sk_buff **last)
 {
@@ -244,6 +246,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 			} else
 				__skb_unlink(skb, queue);
 
+			if (dequeue_cb)
+				dequeue_cb(sk, skb, flags);
 			spin_unlock_irqrestore(&queue->lock, cpu_flags);
 			*off = _off;
 			return skb;
@@ -262,6 +266,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 EXPORT_SYMBOL(__skb_try_recv_datagram);
 
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+				    skb_dequeue_cb_t dequeue_cb,
 				    int *peeked, int *off, int *err)
 {
 	struct sk_buff *skb, *last;
@@ -270,8 +275,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	do {
-		skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
-					      &last);
+		skb = __skb_try_recv_datagram(sk, flags, dequeue_cb, peeked,
+					      off, err, &last);
 		if (skb)
 			return skb;
 
@@ -290,7 +295,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
 	int peeked, off = 0;
 
 	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				   &peeked, &off, err);
+				   NULL, &peeked, &off, err);
 }
 EXPORT_SYMBOL(skb_recv_datagram);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c833271..2f1a727 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1172,26 +1172,61 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	return ret;
 }
 
+/* fully reclaim rmem/fwd memory allocated for skb */
 static void udp_rmem_release(struct sock *sk, int size, int partial)
 {
 	int amt;
 
 	atomic_sub(size, &sk->sk_rmem_alloc);
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
 	sk->sk_forward_alloc -= amt;
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (amt)
 		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 }
 
-static void udp_rmem_free(struct sk_buff *skb)
+/* if we are not peeking at the skb, reclaim fwd allocated memory;
+ * rmem and protocol memory updates are delayed outside the lock
+ */
+static void udp_dequeue(struct sock *sk, struct sk_buff *skb, int flags)
+{
+	int amt;
+
+	if (flags & MSG_PEEK)
+		return;
+
+	sk->sk_forward_alloc += skb->truesize;
+	amt = (sk->sk_forward_alloc - 1) & ~(SK_MEM_QUANTUM - 1);
+	sk->sk_forward_alloc -= amt;
+	UDP_SKB_CB(skb)->fwd_memory_released = amt >> SK_MEM_QUANTUM_SHIFT;
+}
+
+/* complete the memory reclaiming started with udp_dequeue */
+static void __udp_rmem_release(struct sock *sk, struct sk_buff *skb, int flags)
+{
+	int amt = UDP_SKB_CB(skb)->fwd_memory_released;
+
+	if (flags & MSG_PEEK)
+		return;
+
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	if (amt)
+		__sk_mem_reduce_allocated(sk, amt);
+}
+
+struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
+			       int noblock, int *peeked, int *off, int *err)
 {
-	udp_rmem_release(skb->sk, skb->truesize, 1);
+	struct sk_buff *skb;
+
+	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				  udp_dequeue, peeked, off, err);
+	if (skb)
+		__udp_rmem_release(sk, skb, flags);
+	return skb;
 }
+EXPORT_SYMBOL(__skb_recv_udp);
 
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -1230,7 +1265,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 
 	/* the skb owner is now the udp socket */
 	skb->sk = sk;
-	skb->destructor = udp_rmem_free;
 	skb->dev = NULL;
 	sock_skb_set_dropcount(sk, skb);
 
@@ -1254,8 +1288,13 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 static void udp_destruct_sock(struct sock *sk)
 {
 	/* reclaim completely the forward allocated memory */
-	__skb_queue_purge(&sk->sk_receive_queue);
-	udp_rmem_release(sk, 0, 0);
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		udp_rmem_release(sk, skb->truesize, 0);
+		kfree_skb(skb);
+	}
+
 	inet_sock_destruct(sk);
 }
 
@@ -1303,11 +1342,11 @@ static int first_packet_length(struct sock *sk)
 		atomic_inc(&sk->sk_drops);
 		__skb_unlink(skb, rcvq);
 		__skb_queue_tail(&list_kill, skb);
+		udp_rmem_release(sk, skb->truesize, 1);
+		kfree_skb(skb);
 	}
 	res = skb ? skb->len : -1;
 	spin_unlock_bh(&rcvq->lock);
-
-	__skb_queue_purge(&list_kill);
 	return res;
 }
 
@@ -1362,8 +1401,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 
 try_again:
 	peeking = off = sk_peek_offset(sk, flags);
-	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				  &peeked, &off, &err);
+	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
 
@@ -2583,6 +2621,9 @@ void __init udp_init(void)
 {
 	unsigned long limit;
 
+	BUILD_BUG_ON(sizeof(struct udp_skb_cb) >
+		     FIELD_SIZEOF(struct sk_buff, cb));
+
 	udp_table_init(&udp_table, "UDP");
 	limit = nr_free_buffer_pages() / 8;
 	limit = max(limit, 128UL);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 71963b2..273a806 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -343,8 +343,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 try_again:
 	peeking = off = sk_peek_offset(sk, flags);
-	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				  &peeked, &off, &err);
+	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
 
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 44fb8d8..4c36112 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1053,7 +1053,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
 
 	ASSERT(!irqs_disabled());
 
-	skb = skb_recv_datagram(udp_sk, 0, 1, &ret);
+	skb = skb_recv_udp(udp_sk, 0, 1, &ret);
 	if (!skb) {
 		if (ret == -EAGAIN)
 			return;
@@ -1075,10 +1075,9 @@ void rxrpc_data_ready(struct sock *udp_sk)
 
 	__UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
 
-	/* The socket buffer we have is owned by UDP, with UDP's data all over
-	 * it, but we really want our own data there.
+	/* The UDP protocol already released all skb resources;
+	 * we are free to add our own data there.
 	 */
-	skb_orphan(skb);
 	sp = rxrpc_skb(skb);
 
 	/* dig out the RxRPC connection details */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index e2a55dc..78da4ae 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -547,7 +547,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
 			     0, 0, MSG_PEEK | MSG_DONTWAIT);
 	if (err >= 0)
-		skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+		skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
 
 	if (skb == NULL) {
 		if (err != -EAGAIN) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1758665..7178d0a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1080,7 +1080,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
 	if (sk == NULL)
 		goto out;
 	for (;;) {
-		skb = skb_recv_datagram(sk, 0, 1, &err);
+		skb = skb_recv_udp(sk, 0, 1, &err);
 		if (skb != NULL) {
 			xs_udp_data_read_skb(&transport->xprt, sk, skb);
 			consume_skb(skb);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 145082e..8762018 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2113,8 +2113,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		mutex_lock(&u->iolock);
 
 		skip = sk_peek_offset(sk, flags);
-		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
-					      &last);
+		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
+					      &err, &last);
 		if (skb)
 			break;
 
-- 
1.8.3.1



* Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
  2016-10-28 13:20 ` Paolo Abeni
@ 2016-10-28 17:16 ` Eric Dumazet
  2016-10-28 17:50   ` Eric Dumazet
  -1 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2016-10-28 17:16 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs


Nice!

I was working on this as well and my implementation was somewhat
different.

I then stopped when I found the UDP checksum bug and was waiting for net
to be merged into net-next
( https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=10df8e6152c6c400a563a673e9956320bfce1871 )

On Fri, 2016-10-28 at 15:20 +0200, Paolo Abeni wrote:
> A new argument is added to __skb_recv_datagram; it allows
> the caller to perform a protocol-specific action on dequeue
> under the receive queue spinlock.
> The UDP protocol uses this argument to perform fwd memory
> reclaiming on dequeue, while the protocol memory and rmem updates
> are delayed until after the lock is released, to keep the time spent
> under the lock as low as possible.
> The UDP-specific skb destructor is not used anymore; instead,
> explicit memory reclaiming is performed at close() time and
> when skbs are removed from the receive queue.
> In-kernel UDP protocol users now need to use the
> skb_recv_udp() variant instead of skb_recv_datagram() to
> properly perform memory accounting on dequeue.
> 
> Overall, this allows acquiring the receive queue lock only
> once per dequeue.
> 
> Tested using pktgen with random src ports and 64 byte packets,
> sending at wire speed on a 10G link, with udp_sink as the receiver,
> using an L4-tuple rxhash to stress the contention, and one or more
> udp_sink instances with reuseport.
> 
> nr sinks	vanilla		patched
> 1		440		560
> 3		2150		2300
> 6		3650		3800
> 9		4450		4600
> 12		6250		6450
> 
> Suggested-by: Eric Dumazet <edumazet@google.com>
> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  include/linux/skbuff.h |  4 ++++
>  include/net/udp.h      | 10 ++++++++
>  net/core/datagram.c    | 11 ++++++---
>  net/ipv4/udp.c         | 65 ++++++++++++++++++++++++++++++++++++++++----------
>  net/ipv6/udp.c         |  3 +--
>  net/rxrpc/input.c      |  7 +++---
>  net/sunrpc/svcsock.c   |  2 +-
>  net/sunrpc/xprtsock.c  |  2 +-
>  net/unix/af_unix.c     |  4 ++--
>  9 files changed, 83 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 601258f..dd171a9 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3028,13 +3028,17 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
>  #define skb_walk_frags(skb, iter)	\
>  	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
>  
> +typedef void (*skb_dequeue_cb_t)(struct sock *sk, struct sk_buff *skb,
> +				 int flags);
>  
>  int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
>  				const struct sk_buff *skb);
>  struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
> +					skb_dequeue_cb_t dequeue_cb,
>  					int *peeked, int *off, int *err,
>  					struct sk_buff **last);
>  struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
> +				    skb_dequeue_cb_t dequeue_cb,
>  				    int *peeked, int *off, int *err);
>  struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
>  				  int *err);
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 18f1e6b..983c861 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -48,6 +48,7 @@ struct udp_skb_cb {
>  	} header;
>  	__u16		cscov;
>  	__u8		partial_cov;
> +	int		fwd_memory_released;

I was not adding this field.

>  };
>  #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
>  
> @@ -248,6 +249,15 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
>  /* net/ipv4/udp.c */
>  void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
>  int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
> +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock,
> +			       int *peeked, int *off, int *err);
> +static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
> +					   int noblock, int *err)
> +{
> +	int peeked, off = 0;
> +
> +	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
> +}
>  
>  void udp_v4_early_demux(struct sk_buff *skb);
>  int udp_get_port(struct sock *sk, unsigned short snum,
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index bfb973a..226548b 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -165,6 +165,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
>   *	__skb_try_recv_datagram - Receive a datagram skbuff
>   *	@sk: socket
>   *	@flags: MSG_ flags
> + *	@dequeue_cb: invoked under the receive lock on successful dequeue
>   *	@peeked: returns non-zero if this packet has been seen before
>   *	@off: an offset in bytes to peek skb from. Returns an offset
>   *	      within an skb where data actually starts
> @@ -197,6 +198,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
>   *	the standard around please.
>   */
>  struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
> +					skb_dequeue_cb_t dequeue_cb,
>  					int *peeked, int *off, int *err,
>  					struct sk_buff **last)
>  {
> @@ -244,6 +246,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
>  			} else
>  				__skb_unlink(skb, queue);
>  
> +			if (dequeue_cb)
> +				dequeue_cb(sk, skb, flags);

I was doing this only if the packet was unlinked a few lines above.
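
i.e., roughly:

			} else {
				__skb_unlink(skb, queue);
				if (dequeue_cb)
					dequeue_cb(sk, skb, flags);
			}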

>  			spin_unlock_irqrestore(&queue->lock, cpu_flags);
>  			*off = _off;
>  			return skb;
> @@ -262,6 +266,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
>  EXPORT_SYMBOL(__skb_try_recv_datagram);
>  
>  struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
> +				    skb_dequeue_cb_t dequeue_cb,
>  				    int *peeked, int *off, int *err)
>  {
>  	struct sk_buff *skb, *last;
> @@ -270,8 +275,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
>  	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
>  
>  	do {
> -		skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
> -					      &last);
> +		skb = __skb_try_recv_datagram(sk, flags, dequeue_cb, peeked,
> +					      off, err, &last);
>  		if (skb)
>  			return skb;
>  
> @@ -290,7 +295,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
>  	int peeked, off = 0;
>  
>  	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
> -				   &peeked, &off, err);
> +				   NULL, &peeked, &off, err);
>  }
>  EXPORT_SYMBOL(skb_recv_datagram);
>  
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index c833271..2f1a727 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1172,26 +1172,61 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
>  	return ret;
>  }
>  
> +/* fully reclaim rmem/fwd memory allocated for skb */
>  static void udp_rmem_release(struct sock *sk, int size, int partial)
>  {
>  	int amt;
>  
>  	atomic_sub(size, &sk->sk_rmem_alloc);
> -
> -	spin_lock_bh(&sk->sk_receive_queue.lock);
>  	sk->sk_forward_alloc += size;
>  	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
>  	sk->sk_forward_alloc -= amt;
> -	spin_unlock_bh(&sk->sk_receive_queue.lock);
>  
>  	if (amt)
>  		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
>  }
>  
> -static void udp_rmem_free(struct sk_buff *skb)
> +/* if we are not peeking at the skb, reclaim fwd allocated memory;
> + * rmem and protocol memory updates are delayed outside the lock
> + */
> +static void udp_dequeue(struct sock *sk, struct sk_buff *skb, int flags)
> +{
> +	int amt;
> +
> +	if (flags & MSG_PEEK)
> +		return;
> +
> +	sk->sk_forward_alloc += skb->truesize;
> +	amt = (sk->sk_forward_alloc - 1) & ~(SK_MEM_QUANTUM - 1);
> +	sk->sk_forward_alloc -= amt;
> +	UDP_SKB_CB(skb)->fwd_memory_released = amt >> SK_MEM_QUANTUM_SHIFT;
> +}
> +
> +/* complete the memory reclaiming started with udp_dequeue */
> +static void __udp_rmem_release(struct sock *sk, struct sk_buff *skb, int flags)
> +{
> +	int amt = UDP_SKB_CB(skb)->fwd_memory_released;
> +
> +	if (flags & MSG_PEEK)
> +		return;
> +
> +	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
> +	if (amt)
> +		__sk_mem_reduce_allocated(sk, amt);
> +}
> +
> +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
> +			       int noblock, int *peeked, int *off, int *err)
>  {
> -	udp_rmem_release(skb->sk, skb->truesize, 1);
> +	struct sk_buff *skb;
> +
> +	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
> +				  udp_dequeue, peeked, off, err);
> +	if (skb)
> +		__udp_rmem_release(sk, skb, flags);
> +	return skb;
>  }
> +EXPORT_SYMBOL(__skb_recv_udp);
>  
>  int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  {
> @@ -1230,7 +1265,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  
> 	/* the skb owner is now the udp socket */
>  	skb->sk = sk;
> -	skb->destructor = udp_rmem_free;
>  	skb->dev = NULL;
>  	sock_skb_set_dropcount(sk, skb);
>  
> @@ -1254,8 +1288,13 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  static void udp_destruct_sock(struct sock *sk)
>  {
>  	/* reclaim completely the forward allocated memory */
> -	__skb_queue_purge(&sk->sk_receive_queue);
> -	udp_rmem_release(sk, 0, 0);
> +	struct sk_buff *skb;
> +
> +	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
> +		udp_rmem_release(sk, skb->truesize, 0);
> +		kfree_skb(skb);
> +	}
> +

I was using instead:

+       unsigned int total = 0;
+       struct sk_buff *skb;
+
+       while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+               total += skb->truesize;
+               kfree_skb(skb);
+       }
+       udp_rmem_release(sk, total, 0);


>  	inet_sock_destruct(sk);
>  }
>  
> @@ -1303,11 +1342,11 @@ static int first_packet_length(struct sock *sk)
>  		atomic_inc(&sk->sk_drops);
>  		__skb_unlink(skb, rcvq);
>  		__skb_queue_tail(&list_kill, skb);
> +		udp_rmem_release(sk, skb->truesize, 1);
> +		kfree_skb(skb);
>  	}
>  	res = skb ? skb->len : -1;
>  	spin_unlock_bh(&rcvq->lock);
> -


* Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
  2016-10-28 17:16 ` Eric Dumazet
@ 2016-10-28 17:50   ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2016-10-28 17:50 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On Fri, 2016-10-28 at 10:16 -0700, Eric Dumazet wrote:
> Nice!
> 
> I was working on this as well and my implementation was somewhat
> different.

This is my WIP

Note this can be split in two parts.

1) One adding a struct sock *sk param to ip_cmsg_recv_offset()

   This was because I left skb->sk NULL for skbs stored in the receive
queue.
   You chose instead to set skb->sk, which is unusual (check the
skb_orphan() BUG_ON(); see the snippet below).

2) Udp changes.

Tell me what you think, thanks again!
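
For reference, skb_orphan() currently reads as below; an skb with
skb->sk set but no destructor trips the BUG_ON():

	static inline void skb_orphan(struct sk_buff *skb)
	{
		if (skb->destructor) {
			skb->destructor(skb);
			skb->destructor = NULL;
			skb->sk		= NULL;
		} else {
			BUG_ON(skb->sk);
		}
	}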


diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 601258f6e621..52e70431abc9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3033,9 +3033,13 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 				const struct sk_buff *skb);
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
 					int *peeked, int *off, int *err,
-					struct sk_buff **last);
+					struct sk_buff **last,
+					void (*destructor)(struct sock *sk,
+							   struct sk_buff *skb));
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
-				    int *peeked, int *off, int *err);
+				    int *peeked, int *off, int *err,
+				    void (*destructor)(struct sock *sk,
+						       struct sk_buff *skb));
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
 unsigned int datagram_poll(struct file *file, struct socket *sock,
diff --git a/include/net/ip.h b/include/net/ip.h
index 79b102ffbe16..f91fc376f3fb 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -572,7 +572,8 @@ int ip_options_rcv_srr(struct sk_buff *skb);
  */
 
 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset);
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+			 struct sk_buff *skb, int tlen, int offset);
 int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
 		 struct ipcm_cookie *ipc, bool allow_ipv6);
 int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
@@ -594,7 +595,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
 
 static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
-	ip_cmsg_recv_offset(msg, skb, 0, 0);
+	ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
 }
 
 bool icmp_global_allow(void);
diff --git a/include/net/udp.h b/include/net/udp.h
index 18f1e6b91927..0178f4552c4d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -248,6 +248,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 /* net/ipv4/udp.c */
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
 
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index bfb973aebb5b..48228d15f33c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -198,7 +198,8 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
  */
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 					int *peeked, int *off, int *err,
-					struct sk_buff **last)
+					struct sk_buff **last,
+					void (*destructor)(struct sock *sk, struct sk_buff *skb))
 {
 	struct sk_buff_head *queue = &sk->sk_receive_queue;
 	struct sk_buff *skb;
@@ -241,9 +242,11 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 				}
 
 				atomic_inc(&skb->users);
-			} else
+			} else {
 				__skb_unlink(skb, queue);
-
+				if (destructor)
+					destructor(sk, skb);
+			}
 			spin_unlock_irqrestore(&queue->lock, cpu_flags);
 			*off = _off;
 			return skb;
@@ -262,7 +265,9 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 EXPORT_SYMBOL(__skb_try_recv_datagram);
 
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
-				    int *peeked, int *off, int *err)
+				    int *peeked, int *off, int *err,
+				    void (*destructor)(struct sock *sk,
+						       struct sk_buff *skb))
 {
 	struct sk_buff *skb, *last;
 	long timeo;
@@ -271,7 +276,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 
 	do {
 		skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
-					      &last);
+					      &last, destructor);
 		if (skb)
 			return skb;
 
@@ -290,7 +295,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
 	int peeked, off = 0;
 
 	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				   &peeked, &off, err);
+				   &peeked, &off, err, NULL);
 }
 EXPORT_SYMBOL(skb_recv_datagram);
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b8a2d63d1fb8..1de70870d0aa 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,11 +153,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
 }
 
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
-			 int tlen, int offset)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+			 struct sk_buff *skb, int tlen, int offset)
 {
-	struct inet_sock *inet = inet_sk(skb->sk);
-	unsigned int flags = inet->cmsg_flags;
+	unsigned int flags = inet_sk(sk)->cmsg_flags;
 
 	/* Ordered by supposed usage frequency */
 	if (flags & IP_CMSG_PKTINFO) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bfa9609ba0f4..8ff7ee74a992 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1178,20 +1178,20 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
 
 	atomic_sub(size, &sk->sk_rmem_alloc);
 
-	spin_lock_bh(&sk->sk_receive_queue.lock);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
 	sk->sk_forward_alloc -= amt;
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (amt)
 		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 }
 
-static void udp_rmem_free(struct sk_buff *skb)
+/* Note: called with sk_receive_queue.lock held */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 {
-	udp_rmem_release(skb->sk, skb->truesize, 1);
+	udp_rmem_release(sk, skb->truesize, 1);
 }
+EXPORT_SYMBOL(udp_skb_destructor);
 
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -1220,17 +1220,14 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 		if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
 			err = -ENOBUFS;
 			spin_unlock(&list->lock);
-			goto uncharge_drop;
+			goto drop;
 		}
 
 		sk->sk_forward_alloc += delta;
 	}
-
+	atomic_add(size, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc -= size;
 
-	/* the skb owner in now the udp socket */
-	skb->sk = sk;
-	skb->destructor = udp_rmem_free;
 	skb->dev = NULL;
 	sock_skb_set_dropcount(sk, skb);
 
@@ -1253,9 +1250,14 @@ EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
 
 static void udp_destruct_sock(struct sock *sk)
 {
-	/* reclaim completely the forward allocated memory */
-	__skb_queue_purge(&sk->sk_receive_queue);
-	udp_rmem_release(sk, 0, 0);
+	unsigned int total = 0;
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		total += skb->truesize;
+		kfree_skb(skb);
+	}
+	udp_rmem_release(sk, total, 0);
 	inet_sock_destruct(sk);
 }
 
@@ -1287,12 +1289,11 @@ EXPORT_SYMBOL_GPL(skb_consume_udp);
  */
 static int first_packet_length(struct sock *sk)
 {
-	struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+	struct sk_buff_head *rcvq = &sk->sk_receive_queue;
 	struct sk_buff *skb;
+	int total = 0;
 	int res;
 
-	__skb_queue_head_init(&list_kill);
-
 	spin_lock_bh(&rcvq->lock);
 	while ((skb = skb_peek(rcvq)) != NULL &&
 		udp_lib_checksum_complete(skb)) {
@@ -1302,12 +1303,14 @@ static int first_packet_length(struct sock *sk)
 				IS_UDPLITE(sk));
 		atomic_inc(&sk->sk_drops);
 		__skb_unlink(skb, rcvq);
-		__skb_queue_tail(&list_kill, skb);
+		total += skb->truesize;
+		kfree_skb(skb);
 	}
 	res = skb ? skb->len : -1;
+	if (total)
+		udp_rmem_release(sk, total, 1);
 	spin_unlock_bh(&rcvq->lock);
 
-	__skb_queue_purge(&list_kill);
 	return res;
 }
 
@@ -1363,7 +1366,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 try_again:
 	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				  &peeked, &off, &err);
+				  &peeked, &off, &err, udp_skb_destructor);
 	if (!skb)
 		return err;
 
@@ -1420,7 +1423,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		*addr_len = sizeof(*sin);
 	}
 	if (inet->cmsg_flags)
-		ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off);
+		ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
 
 	err = copied;
 	if (flags & MSG_TRUNC)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a7700bbf6788..7e602b89c64d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -344,7 +344,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 try_again:
 	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				  &peeked, &off, &err);
+				  &peeked, &off, &err, udp_skb_destructor);
 	if (!skb)
 		return err;
 
@@ -425,7 +425,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 	if (is_udp4) {
 		if (inet->cmsg_flags)
-			ip_cmsg_recv_offset(msg, skb,
+			ip_cmsg_recv_offset(msg, sk, skb,
 					    sizeof(struct udphdr), off);
 	} else {
 		if (np->rxopt.all)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 145082e2ba36..8b995f88d000 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2114,7 +2114,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 
 		skip = sk_peek_offset(sk, flags);
 		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
-					      &last);
+					      &last, NULL);
 		if (skb)
 			break;
 


* Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
@ 2016-10-29  8:17         ` Paolo Abeni
  0 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2016-10-29  8:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On Fri, 2016-10-28 at 10:50 -0700, Eric Dumazet wrote:
> On Fri, 2016-10-28 at 10:16 -0700, Eric Dumazet wrote:
> > Nice!
> > 
> > I was working on this as well and my implementation was somewhat
> > different.
> 
> This is my WIP
> 
> Note this can be split in two parts.
> 
> 1) One adding a struct sock *sk param to ip_cmsg_recv_offset()
> 
>    This was because I left skb->sk NULL for skbs stored in the receive
> queue.
>    You chose instead to set skb->sk, which is unusual (check the
> skb_orphan() BUG_ON())
> 
> 2) Udp changes.
> 
> Tell me what you think, thanks again!

Thank you for working on this.

I just gave it a very quick look (the weekend has started, children are
screaming ;-); overall, the implementation seems quite similar to ours.

I like the additional argument to ip_cmsg_recv_offset() instead of
keeping skb->sk set.

If I read udp_skb_destructor() correctly, the atomic manipulation of
both sk_rmem_alloc and udp_memory_allocated will happen under the
receive lock. In our experiments this measurably increased the
contention on the lock compared to moving those operations outside the
lock (as done in our patch). Do you foresee any issues with that?
AFAICS every in-kernel UDP user of skb_recv_datagram() needs to be
updated with both implementations.
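
i.e., with your WIP the consume path is, schematically:

	spin_lock_irqsave(&queue->lock, cpu_flags);
	__skb_unlink(skb, queue);
	udp_skb_destructor(sk, skb);	/* atomic_sub() on sk_rmem_alloc and,
					 * via __sk_mem_reduce_allocated(),
					 * on udp_memory_allocated, all while
					 * the queue lock is held
					 */
	spin_unlock_irqrestore(&queue->lock, cpu_flags);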

Cheers,

Paolo


* Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
@ 2016-10-29 12:43             ` Eric Dumazet
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2016-10-29 12:43 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On Sat, 2016-10-29 at 10:17 +0200, Paolo Abeni wrote:

> Thank you for working on this.
> 
> I just gave it a very quick look (the weekend has started, children are
> screaming ;-); overall, the implementation seems quite similar to ours.
> 
> I like the additional argument to ip_cmsg_recv_offset() instead of
> keeping skb->sk set.
> 
> If I read udp_skb_destructor() correctly, the atomic manipulation of
> both sk_rmem_alloc and udp_memory_allocated will happen under the
> receive lock. In our experiments this measurably increased the
> contention on the lock compared to moving those operations outside the
> lock (as done in our patch). Do you foresee any issues with that?
> AFAICS every in-kernel UDP user of skb_recv_datagram() needs to be
> updated with both implementations.

So if you look at tcp, we do not release the forward allocation at every
recvmsg(), but rather when we are under tcp memory pressure, or when a
timer fires and we know the flow has been idle for a while.
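
Something similar could eventually be done for udp; a rough, untested
sketch, where udp_lazy_reclaim() is a hypothetical helper, and udp would
also need to grow a memory pressure signal (udp_prot does not set one
today):

	/* reclaim fwd allocated memory only when under memory pressure,
	 * as tcp does, instead of at every dequeue
	 */
	static void udp_lazy_reclaim(struct sock *sk)
	{
		if (sk_under_memory_pressure(sk))
			sk_mem_reclaim(sk);
	}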

You hit contention on the lock, but the root cause is that right now udp
is very conservative and also hits false sharing on
udp_memory_allocated.

So I believe this is another problem which needs a fix anyway.

No need to make a complicated patch right now if we know that this
problem will be fixed separately, in another patch?

* Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
  2016-10-29 12:43             ` Eric Dumazet
@ 2016-10-31 15:02             ` Paolo Abeni
  -1 siblings, 1 reply; 11+ messages in thread
From: Paolo Abeni @ 2016-10-31 15:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On Sat, 2016-10-29 at 05:43 -0700, Eric Dumazet wrote:
> On Sat, 2016-10-29 at 10:17 +0200, Paolo Abeni wrote:
> 
> > Thank you for working on this.
> > 
> > I just gave it a very quick look (the weekend has started, children are
> > screaming ;-); overall, the implementation seems quite similar to ours.
> > 
> > I like the additional argument to ip_cmsg_recv_offset() instead of
> > keeping skb->sk set.
> > 
> > If I read udp_skb_destructor() correctly, the atomic manipulation of
> > both sk_rmem_alloc and udp_memory_allocated will happen under the
> > receive lock. In our experiments this measurably increased the
> > contention on the lock compared to moving those operations outside the
> > lock (as done in our patch). Do you foresee any issues with that?
> > AFAICS every in-kernel UDP user of skb_recv_datagram() needs to be
> > updated with both implementations.
> 
> So if you look at tcp, we do not release the forward allocation at every
> recvmsg(), but rather when we are under tcp memory pressure, or when a
> timer fires and we know the flow has been idle for a while.
> 
> You hit contention on the lock, but the root cause is that right now udp
> is very conservative and also hits false sharing on
> udp_memory_allocated.
> 
> So I believe this is another problem which needs a fix anyway.
> 
> No need to make a complicated patch right now if we know that this
> problem will be fixed separately, in another patch?

No problem at all with incremental patches ;-)

In our experiments, touching udp_memory_allocated is only a part of
the source of contention, with the biggest source being the
sk_rmem_alloc update, which happens on every dequeue.

We experimented with forward-allocating the whole sk_rcvbuf; even in
that scenario we hit significant contention when the sk_rmem_alloc
update was done under the lock, while full sk_rcvbuf forward allocation
with the sk_rmem_alloc update outside the spinlock gave performance
very similar to our posted patch.

I think that the next step (after the double lock on dequeue removal)
should be moving sk_rmem_alloc outside the lock: the needed changes for
doing that on top of double lock on dequeue removal are very small
(would add ~10 lines of code).

Paolo
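
A rough sketch of the split described above - dequeue and forward
memory bookkeeping under the receive queue lock, the atomic
sk_rmem_alloc update after the unlock. udp_dequeue_sketch() and
udp_rmem_release() are illustrative names, not the posted patch:

/* Keep the critical section minimal: sk_rmem_alloc is an atomic_t,
 * so its update can safely happen after the unlock.
 */
static struct sk_buff *udp_dequeue_sketch(struct sock *sk)
{
	struct sk_buff_head *q = &sk->sk_receive_queue;
	struct sk_buff *skb;
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	skb = __skb_dequeue(q);
	if (skb)
		udp_rmem_release(sk, skb->truesize);	/* fwd memory only */
	spin_unlock_irqrestore(&q->lock, flags);

	/* The rmem decrement no longer extends the critical section. */
	if (skb)
		atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
	return skb;
}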

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
  2016-10-31 15:02             ` Paolo Abeni
@ 2016-10-31 15:16                   ` Eric Dumazet
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2016-10-31 15:16 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On Mon, 2016-10-31 at 16:02 +0100, Paolo Abeni wrote:

> No problem at all with incremental patches ;-)
> 
> In our experiments, touching udp_memory_allocated is only part of the
> source of contention, the biggest source being the sk_rmem_alloc
> update - which happens on every dequeue.
> 
> We experimented with doing fwd alloc of the whole sk_rcvbuf; even in
> that scenario we hit significant contention if the sk_rmem_alloc update
> was done under the lock, while full sk_rcvbuf forward allocation with
> the sk_rmem_alloc update outside the spinlock gave performance very
> similar to our posted patch.
> 
> I think that the next step (after the removal of the double lock on
> dequeue) should be moving the sk_rmem_alloc update outside the lock:
> the changes needed to do that on top of the double-lock removal are
> very small (~10 lines of code).

During my load tests, one of the major factors was sk_drops being
incremented like crazy, dirtying a critical cache line and hurting
sk_filter reads, for example.

        /* --- cacheline 6 boundary (384 bytes) --- */
        struct {
                atomic_t           rmem_alloc;           /* 0x180   0x4 */
                int                len;                  /* 0x184   0x4 */
                struct sk_buff *   head;                 /* 0x188   0x8 */
                struct sk_buff *   tail;                 /* 0x190   0x8 */
        } sk_backlog;                                    /* 0x180  0x18 */
        int                        sk_forward_alloc;     /* 0x198   0x4 */
        __u32                      sk_txhash;            /* 0x19c   0x4 */
        unsigned int               sk_napi_id;           /* 0x1a0   0x4 */
        unsigned int               sk_ll_usec;           /* 0x1a4   0x4 */
        atomic_t                   sk_drops;             /* 0x1a8   0x4 */
        int                        sk_rcvbuf;            /* 0x1ac   0x4 */
        struct sk_filter *         sk_filter;            /* 0x1b0   0x8 */
        union {
                struct socket_wq * sk_wq;                /*         0x8 */
                struct socket_wq * sk_wq_raw;            /*         0x8 */
        };                                               /* 0x1b8   0x8 */

I was playing with moving this field elsewhere and not reading it when
not necessary.

diff --git a/include/net/sock.h b/include/net/sock.h
index f13ac87a8015cb18c5d3fe5fdcf2d6a0592428f4..a901df591eb45e153517cdb8b409b61563d1a4e3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2112,7 +2112,8 @@ struct sock_skb_cb {
 static inline void
 sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 {
-	SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+	SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+					atomic_read(&sk->sk_drops) : 0;
 }
 
 static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
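
The diff above means sockets that never set SOCK_RXQ_OVFL no longer
read the hot sk_drops cache line when queuing a packet. For reference,
the userspace opt-in that makes dropcount meaningful (this is the
standard SO_RXQ_OVFL API, not anything new):

#include <stdio.h>
#include <sys/socket.h>

/* fd is an already-created UDP socket.  After this setsockopt(),
 * recvmsg() delivers a SOL_SOCKET/SO_RXQ_OVFL cmsg carrying a __u32
 * drop count sampled at enqueue time.
 */
void enable_drop_count(int fd)
{
	int on = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_RXQ_OVFL, &on, sizeof(on)) < 0)
		perror("setsockopt(SO_RXQ_OVFL)");
}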




^ permalink raw reply related	[flat|nested] 11+ messages in thread


end of thread, other threads:[~2016-10-31 15:16 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-28 13:20 [PATCH net-next] udp: do fwd memory scheduling on dequeue Paolo Abeni
2016-10-28 17:16 ` Eric Dumazet
2016-10-28 17:50   ` Eric Dumazet
2016-10-29  8:17       ` Paolo Abeni
2016-10-29 12:43           ` Eric Dumazet
2016-10-31 15:02             ` Paolo Abeni
2016-10-31 15:16                   ` Eric Dumazet
