From: Paolo Abeni <pabeni@redhat.com>
To: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <eric.dumazet@gmail.com>
Subject: [PATCH net-next 2/2] udp: implement and use per cpu rx skbs cache
Date: Wed, 18 Apr 2018 12:22:38 +0200
Message-ID: <dacbc7a3626bb170629e02159ed2f90120f06382.1524045911.git.pabeni@redhat.com>
In-Reply-To: <cover.1524045911.git.pabeni@redhat.com>

This changeset extends the idea behind commit c8c8b127091b ("udp:
under rx pressure, try to condense skbs"), trading additional BH CPU
time and memory bandwidth for a lower load on the user-space
receiver.

At boot time we allocate a limited number of skbs with small
data buffers, storing them in per-CPU arrays. Such skbs are never
freed.

At run time, under rx pressure, the BH tries to copy the ingress
skb contents into the cache - provided the current cache skb is
available and the ingress skb is small enough and carries no head
states.

When the cache skb is used, the ingress skb is dropped by the BH
- while still hot in cache - and the cache skb is inserted into
the rx queue after its usage count has been increased. The cache
array index is then moved to the next entry.

The receive side is unmodified: in udp_recvmsg() the skb usage
count is decreased and the skb is _not_ freed, since the cache
keeps its usage count above zero. Because skb->users is hot in the
receiver's cache at consume time - the receiver has just read
skb->data, which lies in the same cacheline - the whole
skb_consume_udp() becomes really cheap.
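
To make the refcount dance explicit - again a self-contained userspace
sketch, not the kernel code - the cache permanently holds one reference,
the BH takes a second one when enqueuing, and the receiver's 'free' is a
single decrement that never reaches zero, so the buffer is recycled
instead of being released:

/* Userspace illustration of the refcount trick: users == 1 means the
 * buffer is owned only by the cache and can be reused, users == 2 means
 * it is also sitting in some receive queue.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct cached_buf {
	atomic_int users;	/* models skb->users */
	char data[512];
};

/* BH side: reuse a slot only if the previous consumer is done with it. */
static int cache_get(struct cached_buf *b)
{
	if (atomic_load(&b->users) != 1)
		return -1;			/* still queued somewhere */
	atomic_fetch_add(&b->users, 1);		/* reference owned by the rx queue */
	return 0;
}

/* Receiver side: what consuming a cached buffer boils down to. */
static void cache_consume(struct cached_buf *b)
{
	int old = atomic_fetch_sub(&b->users, 1);

	assert(old > 1);	/* the cache reference keeps the buffer alive */
}

int main(void)
{
	struct cached_buf buf = { .users = 1 };

	for (int i = 0; i < 3; i++) {	/* the same buffer is recycled */
		if (cache_get(&buf))
			return 1;
		cache_consume(&buf);
	}
	printf("buffer reused 3 times, never freed\n");
	return 0;
}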

UDP receive performance under flood improves as follows:

NR RX queues	Kpps before	Kpps after	Delta (%)

1		2252		2305		2
2		2151		2569		19
4		2033		2396		17
8		1969		2329		18

Overall performance of the knotd DNS server under a real traffic
flood improves as follows:

		Kpps before	Kpps after	Delta (%)

		3777		3981		5
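
The patch does not say which tools produced the figures above; as a
rough illustration of the kind of measurement involved, a minimal
userspace UDP sink that drains a socket in a tight loop and prints the
receive rate could look like the following (port and buffer size are
arbitrary choices):

/* Minimal UDP sink: bind a port, drain packets as fast as possible and
 * report the receive rate roughly once per second.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <time.h>

int main(int argc, char **argv)
{
	int port = argc > 1 ? atoi(argv[1]) : 9999;
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(port),
	};
	char buf[2048];
	long pkts = 0;
	time_t last = time(NULL);
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("udp sink");
		return 1;
	}

	for (;;) {
		if (recv(fd, buf, sizeof(buf), 0) > 0)
			pkts++;
		if (time(NULL) != last) {
			printf("%.1f Kpps\n", pkts / 1000.0);
			pkts = 0;
			last = time(NULL);
		}
	}
}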

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
Performance figures were collected with both PAGE_TABLE_ISOLATION
and RETPOLINE enabled; this accounts for the baseline numbers.
---
 net/ipv4/udp.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3fb0fbf4977d..bb1879cd51b4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,6 +125,26 @@ EXPORT_SYMBOL(sysctl_udp_mem);
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
+struct skb_cache_entry {
+	int size;
+	int head;
+	struct sk_buff *skbs[0];
+};
+
+static struct skb_cache_entry __percpu *skb_cache;
+
+/* Under socket memory pressure, small packets are copied to a percpu cache
+ * before enqueuing them, to decrease the load on the receiver process.
+ * To avoid excessive copy overhead we use a small skb size threshold.
+ * Each percpu cache should be able to cope with at least one socket under
+ * memory pressure. It doesn't need to handle many of them: if there are
+ * more than a few sockets under memory pressure, user space is most
+ * probably too slow to keep up and there is no gain in using the cache.
+ */
+#define UDP_CACHE_MAX_SKB_LEN		512
+#define UDP_CACHE_MIN_SIZE		_SK_MEM_PACKETS
+#define UDP_CACHE_MAX_SIZE		(_SK_MEM_PACKETS * 3)
+
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
@@ -1246,6 +1266,82 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
 	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
 }
 
+static inline struct sk_buff *udp_cache_get_skb(void)
+{
+	struct skb_cache_entry *cache;
+	struct sk_buff *skb;
+
+	if (unlikely(!skb_cache))
+		return NULL;
+
+	cache = this_cpu_ptr(skb_cache);
+	skb = cache->skbs[cache->head];
+	if (refcount_read(&skb->users) != 1)
+		return NULL;
+
+	/* peeking with offset clones the queued skbs, so we must check that
+	 * all the cloned references are gone.
+	 * This barrier is paired with the implicit one in skb_unref(), while
+	 * decrementing skb->users.
+	 */
+	rmb();
+	if (unlikely(skb->cloned)) {
+		if (atomic_read(&skb_shinfo(skb)->dataref) != 1)
+			return NULL;
+		skb->cloned = 0;
+	}
+
+	cache->head++;
+	if (cache->head == cache->size)
+		cache->head = 0;
+	refcount_inc(&skb->users);
+	return skb;
+}
+
+static bool udp_copy_to_cache(struct sk_buff **s)
+{
+	struct sk_buff *skb2, *skb = *s;
+	int hlen;
+
+	/* check if we can copy the specified skb into the cache: data + l3 +
+	 * l4 must be below the cached skb size and no head states must
+	 * be attached.
+	 */
+	hlen = skb_network_header_len(skb) + sizeof(struct udphdr);
+	if ((hlen + skb->len) >= UDP_CACHE_MAX_SKB_LEN || skb_sec_path(skb))
+		return false;
+
+	skb2 = udp_cache_get_skb();
+	if (!skb2)
+		return false;
+
+	/* copy the relevant header fields: we skip the head states - we know
+	 * no state is attached to 'skb' - the irrelevant part of the CB, and
+	 * skb->dev, which will be overwritten later by udp_set_dev_scratch()
+	 */
+	skb2->tstamp	    = skb->tstamp;
+	*UDP_SKB_CB(skb2)   = *UDP_SKB_CB(skb);
+	skb2->queue_mapping = skb->queue_mapping;
+	memcpy(&skb2->headers_start, &skb->headers_start,
+	       offsetof(struct sk_buff, headers_end) -
+	       offsetof(struct sk_buff, headers_start));
+
+	/* skip the mac header, we don't need it */
+	skb_copy_bits(skb, -hlen, skb2->head, skb->len + hlen);
+
+	/* override the relevant offsets: skb2 starts from the network hdr */
+	skb2->transport_header = hlen - sizeof(struct udphdr);
+	skb2->network_header  = 0;
+	skb2->mac_header = 0;
+	skb2->data = skb2->head + hlen;
+	skb_set_tail_pointer(skb2, skb->len);
+	skb2->len = skb->len;
+	consume_skb(skb);
+
+	*s = skb2;
+	return true;
+}
+
 /* Idea of busylocks is to let producers grab an extra spinlock
  * to relieve pressure on the receive_queue spinlock shared by consumer.
  * Under flood, this means that only one producer can be in line
@@ -1290,9 +1386,12 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	 * - Reduce memory overhead and thus increase receive queue capacity
 	 * - Less cache line misses at copyout() time
 	 * - Less work at consume_skb() (less alien page frag freeing)
+	 * Additionally, processing skbs from the cache allows udp_recvmsg()
+	 * to 'free' them with a single atomic operation on a hot cacheline
 	 */
 	if (rmem > (sk->sk_rcvbuf >> 1)) {
-		skb_condense(skb);
+		if (!udp_copy_to_cache(&skb))
+			skb_condense(skb);
 
 		busy = busylock_acquire(sk);
 	}
@@ -2858,6 +2957,64 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
 	.init	= udp_sysctl_init,
 };
 
+static void udp_free_cache(int nr)
+{
+	int i, cpu;
+
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < nr; ++i)
+			kfree_skb(per_cpu_ptr(skb_cache, cpu)->skbs[i]);
+
+	free_percpu(skb_cache);
+	skb_cache = NULL;
+}
+
+static void udp_init_cache(unsigned long max_size)
+{
+	size_t skb_guessed_size, per_cpu_size;
+	unsigned long total_size = 0;
+	struct sk_buff *skb;
+	int i, nr, cpu = 0;
+
+	/* try to fill the cache only if we can allocate a reasonable number
+	 * of skbs
+	 */
+	skb_guessed_size = SKB_TRUESIZE(UDP_CACHE_MAX_SKB_LEN);
+	nr = min_t(unsigned long, UDP_CACHE_MAX_SIZE,
+		   max_size / (nr_cpu_ids * skb_guessed_size));
+	if (nr < UDP_CACHE_MIN_SIZE) {
+		pr_info("low memory, UDP skbs cache will not be allocated\n");
+		return;
+	}
+
+	per_cpu_size = nr * sizeof(void *) + sizeof(struct skb_cache_entry);
+	skb_cache = __alloc_percpu_gfp(per_cpu_size, L1_CACHE_BYTES,
+				       GFP_KERNEL | __GFP_ZERO);
+	if (!skb_cache) {
+		pr_warn("Can't allocate UDP skb cache\n");
+		return;
+	}
+
+	pr_info("allocating %d skbs on %d CPUs for rx cache\n", nr, nr_cpu_ids);
+	for (i = 0; i < nr && total_size < max_size; ++i) {
+		for_each_possible_cpu(cpu) {
+			skb = __alloc_skb(UDP_CACHE_MAX_SKB_LEN, GFP_KERNEL,
+					  0, cpu_to_node(cpu));
+			if (!skb) {
+				pr_warn("allocation failure, cache disabled\n");
+				udp_free_cache(nr);
+				return;
+			}
+
+			total_size += skb->truesize;
+			per_cpu_ptr(skb_cache, cpu)->skbs[i] = skb;
+		}
+	}
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(skb_cache, cpu)->size = nr;
+}
+
 void __init udp_init(void)
 {
 	unsigned long limit;
@@ -2871,6 +3028,7 @@ void __init udp_init(void)
 	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
 
 	__udp_sysctl_init(&init_net);
+	udp_init_cache(sysctl_udp_mem[0] / 100 * PAGE_SIZE);
 
 	/* 16 spinlocks per cpu */
 	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
-- 
2.14.3

Thread overview: 14+ messages
2018-04-18 10:22 [PATCH net-next 0/2] UDP: introduce RX skb cache Paolo Abeni
2018-04-18 10:22 ` [PATCH net-next 1/2] udp: if the rx queue is full, free the skb in __udp_enqueue_schedule_skb() Paolo Abeni
2018-04-18 10:22 ` Paolo Abeni [this message]
2018-04-18 16:56   ` [PATCH net-next 2/2] udp: implement and use per cpu rx skbs cache Eric Dumazet
2018-04-18 17:15     ` Paolo Abeni
2018-04-18 19:21       ` Eric Dumazet
2018-04-19  7:40         ` Paolo Abeni
2018-04-19 13:47           ` Eric Dumazet
2018-04-20 13:48             ` Jesper Dangaard Brouer
2018-04-21 15:54               ` Willem de Bruijn
2018-04-21 16:45                 ` Eric Dumazet
2018-04-22 11:22               ` Paolo Abeni
2018-04-23  8:52                 ` Jesper Dangaard Brouer
2018-04-23  8:13               ` Tariq Toukan
