From mboxrd@z Thu Jan 1 00:00:00 1970 From: Paolo Abeni Subject: [PATCH net-next 2/2] udp: implement and use per cpu rx skbs cache Date: Wed, 18 Apr 2018 12:22:38 +0200 Message-ID: References: Cc: "David S. Miller" , Eric Dumazet To: netdev@vger.kernel.org Return-path: Received: from mx3-rdu2.redhat.com ([66.187.233.73]:35876 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1750861AbeDRKXX (ORCPT ); Wed, 18 Apr 2018 06:23:23 -0400 In-Reply-To: Sender: netdev-owner@vger.kernel.org List-ID: This changeset extends the idea behind commit c8c8b127091b ("udp: under rx pressure, try to condense skbs"), trading more BH cpu time and memory bandwidth to decrease the load on the user space receiver. At boot time we allocate a limited number of skbs with small data buffers, storing them in per cpu arrays. Such skbs are never freed. At run time, under rx pressure, the BH tries to copy the current skb contents into the cache - if the current cache skb is available, and the ingress skb is small enough and without any head states. When using the cache skb, the ingress skb is dropped by the BH - while still hot on cache - and the cache skb is inserted into the rx queue, after increasing its usage count. Also, the cache array index is moved to the next entry. The receive side is unmodified: in udp_recvmsg() the skb usage count is decreased and the skb is _not_ freed - since the cache keeps usage > 0. Since skb->users is hot in the cache of the receiver at consume time - the receiver has just read skb->data, which lies in the same cacheline - the whole skb_consume_udp() becomes really cheap. 
UDP receive performance under flood improves as follows: NR RX queues Kpps Kpps Delta (%) Before After 1 2252 2305 2 2 2151 2569 19 4 2033 2396 17 8 1969 2329 18 Overall performance of knotd DNS server under real traffic flood improves as follows: Kpps Kpps Delta (%) Before After 3777 3981 5 Signed-off-by: Paolo Abeni -- Performance figures are with both PAGE_TABLE_ISOLATION and RETPOLINES enabled, this is why the baseline --- net/ipv4/udp.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3fb0fbf4977d..bb1879cd51b4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -125,6 +125,26 @@ EXPORT_SYMBOL(sysctl_udp_mem); atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); +struct skb_cache_entry { + int size; + int head; + struct sk_buff *skbs[0]; +}; + +static struct skb_cache_entry __percpu *skb_cache; + +/* Under socket memory pressure, small packets are copied to a percpu cache + * before enqueuing them, to decrease the load on the receiver process. + * To avoid excessive copy overhead we use a small skb size threshold. + * Each percpu cache should be able to cope with at least one socket under + * memory pressure. 
It doesn't need to handle many of them: if there are + * more than a few sockets under memory pressure, the user-space is most + * probably too lazy and there is no gain in using the cache + */ +#define UDP_CACHE_MAX_SKB_LEN 512 +#define UDP_CACHE_MIN_SIZE _SK_MEM_PACKETS +#define UDP_CACHE_MAX_SIZE (_SK_MEM_PACKETS * 3) + #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) @@ -1246,6 +1266,82 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } +static inline struct sk_buff *udp_cache_get_skb(void) +{ + struct skb_cache_entry *cache; + struct sk_buff *skb; + + if (unlikely(!skb_cache)) + return NULL; + + cache = this_cpu_ptr(skb_cache); + skb = cache->skbs[cache->head]; + if (refcount_read(&skb->users) != 1) + return NULL; + + /* peeking with offset clones the queued skbs, we must check that all + * the cloned references are gone. + * This barrier is paired with the implicit one in skb_unref(), while + * decrementing skb->users. + */ + rmb(); + if (unlikely(skb->cloned)) { + if (atomic_read(&skb_shinfo(skb)->dataref) != 1) + return NULL; + skb->cloned = 0; + } + + cache->head++; + if (cache->head == cache->size) + cache->head = 0; + refcount_inc(&skb->users); + return skb; +} + +static bool udp_copy_to_cache(struct sk_buff **s) +{ + struct sk_buff *skb2, *skb = *s; + int hlen; + + /* check if we can copy the specified skb into the cache: data + l3 + + * l4 must be below the cached skb size and no head states must + * be attached. 
+ */ + hlen = skb_network_header_len(skb) + sizeof(struct udphdr); + if ((hlen + skb->len) >= UDP_CACHE_MAX_SKB_LEN || skb_sec_path(skb)) + return false; + + skb2 = udp_cache_get_skb(); + if (!skb2) + return false; + + /* copy the relevant header: we skip the head states - we know no state + * is attached to 'skb' - the irrelevant part of the CB, and + * skb->dev - will be overwritten later by udp_set_dev_scratch() + */ + skb2->tstamp = skb->tstamp; + *UDP_SKB_CB(skb2) = *UDP_SKB_CB(skb); + skb2->queue_mapping = skb->queue_mapping; + memcpy(&skb2->headers_start, &skb->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + + /* skip the mac header, we don't need it */ + skb_copy_bits(skb, -hlen, skb2->head, skb->len + hlen); + + /* override the relevant offsets: skb2 starts from the network hdr */ + skb2->transport_header = hlen - sizeof(struct udphdr); + skb2->network_header = 0; + skb2->mac_header = 0; + skb2->data = skb2->head + hlen; + skb_set_tail_pointer(skb2, skb->len); + skb2->len = skb->len; + consume_skb(skb); + + *s = skb2; + return true; +} + /* Idea of busylocks is to let producers grab an extra spinlock * to relieve pressure on the receive_queue spinlock shared by consumer. 
* Under flood, this means that only one producer can be in line @@ -1290,9 +1386,12 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) + * Additionally, processing skbs from the cache allows udp_recvmsg() + * to 'free' them with a single atomic operation on a hot cacheline */ if (rmem > (sk->sk_rcvbuf >> 1)) { - skb_condense(skb); + if (!udp_copy_to_cache(&skb)) + skb_condense(skb); busy = busylock_acquire(sk); } @@ -2858,6 +2957,64 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, }; +static void udp_free_cache(int nr) +{ + int i, cpu; + + for_each_possible_cpu(cpu) + for (i = 0; i < nr; ++i) + kfree_skb(per_cpu_ptr(skb_cache, cpu)->skbs[i]); + + free_percpu(skb_cache); + skb_cache = NULL; +} + +static void udp_init_cache(unsigned long max_size) +{ + size_t skb_guessed_size, per_cpu_size; + unsigned long total_size = 0; + struct sk_buff *skb; + int i, nr, cpu = 0; + + /* try to fill the cache only if we can allocate a reasonable number + * of skbs + */ + skb_guessed_size = SKB_TRUESIZE(UDP_CACHE_MAX_SKB_LEN); + nr = min_t(unsigned long, UDP_CACHE_MAX_SIZE, + max_size / (nr_cpu_ids * skb_guessed_size)); + if (nr < UDP_CACHE_MIN_SIZE) { + pr_info("low memory, UDP skbs cache will not be allocated\n"); + return; + } + + per_cpu_size = nr * sizeof(void *) + sizeof(struct skb_cache_entry); + skb_cache = __alloc_percpu_gfp(per_cpu_size, L1_CACHE_BYTES, + GFP_KERNEL | __GFP_ZERO); + if (!skb_cache) { + pr_warn("Can't allocate UDP skb cache\n"); + return; + } + + pr_info("allocating %d skbs on %d CPUs for rx cache\n", nr, nr_cpu_ids); + for (i = 0; i < nr && total_size < max_size; ++i) { + for_each_possible_cpu(cpu) { + skb = __alloc_skb(UDP_CACHE_MAX_SKB_LEN, GFP_KERNEL, + 0, cpu_to_node(cpu)); + if (!skb) { + pr_warn("allocation 
failure, cache disabled"); + udp_free_cache(nr); + return; + } + + total_size += skb->truesize; + per_cpu_ptr(skb_cache, cpu)->skbs[i] = skb; + } + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(skb_cache, cpu)->size = nr; +} + void __init udp_init(void) { unsigned long limit; @@ -2871,6 +3028,7 @@ void __init udp_init(void) sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; __udp_sysctl_init(&init_net); + udp_init_cache(sysctl_udp_mem[0] / 100 * PAGE_SIZE); /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; -- 2.14.3