From: Jesper Dangaard Brouer
Subject: RFC crap-patch [PATCH] net: Per CPU separate frag mem accounting
Date: Thu, 14 Mar 2013 08:25:55 +0100
Message-ID: <1363245955.14913.21.camel@localhost>
References: <20130308221647.5312.33631.stgit@dragon>
	 <20130308221744.5312.14924.stgit@dragon>
In-Reply-To: <20130308221744.5312.14924.stgit@dragon>
To: Eric Dumazet, Hannes Frederic Sowa
Cc: netdev@vger.kernel.org, yoshfuji@linux-ipv6.org

This is NOT the patch I just mentioned in the other thread (the one
removing the LRU list).  This patch does real per-CPU mem accounting,
and an LRU list per CPU.

I get really good performance numbers with this patch, but I still
think this might not be the correct solution.

My current best results (the patches that got applied recently),
compared to this patch:

 - Test-type:  Test-20G64K      Test-20G3F   20G64K+DoS   20G3F+DoS
 - Patch-06:   18486.7 Mbit/s     10723.20      3657.85     4560.64 Mbit/s
 - curr-best:  19041.0 Mbit/s     12105.20     10160.40    11179.30 Mbit/s

Thus, I have almost solved the DoS effect for Test-20G3F: 12 Gbit/s ->
11 Gbit/s under DoS.  The 64K+DoS case is not perfect yet:
19 Gbit/s -> 11 Gbit/s.

(For illustration only, a small standalone sketch of the per-CPU
accounting idea follows after the quoted patch at the bottom.)

--Jesper


On Fri, 2013-03-08 at 23:17 +0100, Jesper Dangaard Brouer wrote:
> ... testing if the percpu_counter does not scale in DoS situations
> ---
>
>  include/net/inet_frag.h                 |   99 ++++++++++++++++++-------------
>  net/ipv4/inet_fragment.c                |   61 +++++++++++++++----
>  net/ipv4/ip_fragment.c                  |    3 +
>  net/ipv6/netfilter/nf_conntrack_reasm.c |    2 -
>  net/ipv6/reassembly.c                   |    2 -
>  5 files changed, 110 insertions(+), 57 deletions(-)
>
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index f2b46a5..974434a 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -1,22 +1,31 @@
>  #ifndef __NET_FRAG_H__
>  #define __NET_FRAG_H__
>
> -#include
> +//#include
> +#include
> +#include
> +#include
>
> -struct netns_frags {
> -        int                     nqueues;
> -        struct list_head        lru_list;
> -        spinlock_t              lru_lock;
> +/* Need to maintain these resource limits per CPU, else we will kill
> + * performance due to cache-line bouncing
> + */
> +struct frag_cpu_limit {
> +        atomic_t                mem;
> +        struct list_head        lru_list;
> +        spinlock_t              lru_lock;
> +} ____cacheline_aligned_in_smp;
>
> -        /* The percpu_counter "mem" need to be cacheline aligned.
> -         * mem.count must not share cacheline with other writers
> -         */
> -        struct percpu_counter   mem ____cacheline_aligned_in_smp;
> +struct netns_frags {
>
>          /* sysctls */
>          int                     timeout;
>          int                     high_thresh;
>          int                     low_thresh;
> +
> +        struct frag_cpu_limit __percpu *percpu;
> +
> +        // TODO move "nqueues" elsewere...
> +        int nqueues ____cacheline_aligned_in_smp;
>  };
>
>  struct inet_frag_queue {
> @@ -25,6 +34,7 @@ struct inet_frag_queue {
>          struct list_head        lru_list;   /* lru list member */
>          struct hlist_node       list;
>          atomic_t                refcnt;
> +        u32                     cpu_alloc;  /* used for mem limit accounting */
>          struct sk_buff          *fragments; /* list of received fragments */
>          struct sk_buff          *fragments_tail;
>          ktime_t                 stamp;
> @@ -80,7 +90,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
>  void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
>  void inet_frag_destroy(struct inet_frag_queue *q,
>                                  struct inet_frags *f, int *work);
> -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
> +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
> +                      bool force, int on_cpu);
>  struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
>                  struct inet_frags *f, void *key, unsigned int hash)
>          __releases(&f->lock);
> @@ -93,59 +104,65 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
>
>  /* Memory Tracking Functions. */
>
> -/* The default percpu_counter batch size is not big enough to scale to
> - * fragmentation mem acct sizes.
> - * The mem size of a 64K fragment is approx:
> - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
> - */
> -static unsigned int frag_percpu_counter_batch = 130000;
> -
> -static inline int frag_mem_limit(struct netns_frags *nf)
> -{
> -        return percpu_counter_read(&nf->mem);
> -}
> -
>  static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
>  {
> -        __percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
> +        int cpu = q->cpu_alloc;
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +        atomic_sub(i, &percpu->mem);
>  }
>
>  static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
>  {
> -        __percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
> -}
> -
> -static inline void init_frag_mem_limit(struct netns_frags *nf)
> -{
> -        percpu_counter_init(&nf->mem, 0);
> +        int cpu = q->cpu_alloc;
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +        atomic_add(i, &percpu->mem);
>  }
>
>  static inline int sum_frag_mem_limit(struct netns_frags *nf)
>  {
> -        return percpu_counter_sum_positive(&nf->mem);
> +        unsigned int sum = 0;
> +        int cpu;
> +
> +        for_each_possible_cpu(cpu) {
> +                struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> +                sum += atomic_read(&percpu->mem);
> +        }
> +        return sum;
>  }
>
> +/* LRU (Least Recently Used) resource functions */
> +
>  static inline void inet_frag_lru_move(struct inet_frag_queue *q)
>  {
> -        spin_lock(&q->net->lru_lock);
> -        list_move_tail(&q->lru_list, &q->net->lru_list);
> -        spin_unlock(&q->net->lru_lock);
> +        int cpu = q->cpu_alloc;
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +
> +        spin_lock(&percpu->lru_lock);
> +        list_move_tail(&q->lru_list, &percpu->lru_list);
> +        spin_unlock(&percpu->lru_lock);
>  }
>
>  static inline void inet_frag_lru_del(struct inet_frag_queue *q)
>  {
> -        spin_lock(&q->net->lru_lock);
> -        list_del(&q->lru_list);
> -        q->net->nqueues--;
> -        spin_unlock(&q->net->lru_lock);
> +        int cpu = q->cpu_alloc;
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +
> +        spin_lock(&percpu->lru_lock);
> +        list_del(&q->lru_list);
> +        q->net->nqueues--; //FIXME
> +        spin_unlock(&percpu->lru_lock);
>  }
>
>  static inline void inet_frag_lru_add(struct netns_frags *nf,
>                                       struct inet_frag_queue *q)
>  {
> -        spin_lock(&nf->lru_lock);
> -        list_add_tail(&q->lru_list, &nf->lru_list);
> -        q->net->nqueues++;
> -        spin_unlock(&nf->lru_lock);
> +        int cpu = q->cpu_alloc;
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> +        spin_lock(&percpu->lru_lock);
> +        list_add_tail(&q->lru_list, &percpu->lru_list);
> +        q->net->nqueues++; //FIXME
> +        spin_unlock(&percpu->lru_lock);
>  }
>  #endif
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index e5c426f..f09fa7e 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -23,6 +23,18 @@
>
>  #include
>
> +static inline int frag_mem_limit_on_cpu(struct netns_frags *nf, int on_cpu)
> +{
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, on_cpu);
> +        return atomic_read(&percpu->mem);
> +}
> +
> +static inline int frag_mem_limit(struct netns_frags *nf)
> +{
> +        int cpu = smp_processor_id();
> +        return frag_mem_limit_on_cpu(nf, cpu);
> +}
> +
>  static void inet_frag_secret_rebuild(unsigned long dummy)
>  {
>          struct inet_frags *f = (struct inet_frags *)dummy;
> @@ -81,12 +93,28 @@ void inet_frags_init(struct inet_frags *f)
>  }
>  EXPORT_SYMBOL(inet_frags_init);
>
> +static int inet_frags_init_percpu_limit(struct netns_frags *nf)
> +{
> +        int cpu;
> +
> +        nf->percpu = alloc_percpu(struct frag_cpu_limit);
> +        if (!nf->percpu)
> +                return -ENOMEM;
> +
> +        for_each_possible_cpu(cpu) {
> +                struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> +                INIT_LIST_HEAD(&percpu->lru_list);
> +                spin_lock_init(&percpu->lru_lock);
> +                atomic_set(&percpu->mem, 0);
> +        }
> +        return 1;
> +}
> +
>  void inet_frags_init_net(struct netns_frags *nf)
>  {
>          nf->nqueues = 0; //remove?
> -        init_frag_mem_limit(nf);
> -        INIT_LIST_HEAD(&nf->lru_list);
> -        spin_lock_init(&nf->lru_lock);
> +        inet_frags_init_percpu_limit(nf);
>  }
>  EXPORT_SYMBOL(inet_frags_init_net);
>
> @@ -98,13 +126,16 @@ EXPORT_SYMBOL(inet_frags_fini);
>
>  void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
>  {
> +        int cpu;
> +
>          nf->low_thresh = 0;
>
>          local_bh_disable();
> -        inet_frag_evictor(nf, f, true);
> +        for_each_possible_cpu(cpu)
> +                inet_frag_evictor(nf, f, true, cpu);
>          local_bh_enable();
>
> -        percpu_counter_destroy(&nf->mem);
> +        free_percpu(nf->percpu);
>  }
>  EXPORT_SYMBOL(inet_frags_exit_net);
>
> @@ -179,33 +210,36 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
>  }
>  EXPORT_SYMBOL(inet_frag_destroy);
>
> -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
> +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
> +                      bool force, int on_cpu)
>  {
>          struct inet_frag_queue *q;
>          int work, evicted = 0;
> +        int cpu = (likely(on_cpu < 0)) ?
> +                smp_processor_id() : on_cpu;
> +        struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
>
>          if (!force) {
> -                if (frag_mem_limit(nf) <= nf->high_thresh)
> +                if (frag_mem_limit_on_cpu(nf, cpu) <= nf->high_thresh)
>                          return 0;
>          }
>
> -        work = frag_mem_limit(nf) - nf->low_thresh;
> +        work = frag_mem_limit_on_cpu(nf, cpu) - nf->low_thresh;
>          while (work > 0) {
> -                spin_lock(&nf->lru_lock);
> +                spin_lock(&percpu->lru_lock);
>
> -                if (list_empty(&nf->lru_list)) {
> -                        spin_unlock(&nf->lru_lock);
> +                if (list_empty(&percpu->lru_list)) {
> +                        spin_unlock(&percpu->lru_lock);
>                          break;
>                  }
>
> -                q = list_first_entry(&nf->lru_list,
> +                q = list_first_entry(&percpu->lru_list,
>                                  struct inet_frag_queue, lru_list);
>                  atomic_inc(&q->refcnt);
>
>                  // TEST: remove q from list to avoid more CPUs grabbing it
>                  list_del_init(&q->lru_list);
>
> -                spin_unlock(&nf->lru_lock);
> +                spin_unlock(&percpu->lru_lock);
>
>                  spin_lock(&q->lock);
>                  if (!(q->last_in & INET_FRAG_COMPLETE))
> @@ -283,6 +317,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>                  return NULL;
>
>          q->net = nf;
> +        q->cpu_alloc = (u32) smp_processor_id();
>          f->constructor(q, arg);
>          add_frag_mem_limit(q, f->qsize);
>
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index 1211613..4241417 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -18,6 +18,7 @@
>   *              John McDonald   :       0 length frag bug.
>   *              Alexey Kuznetsov:       SMP races, threading, cleanup.
>   *              Patrick McHardy :       LRU queue of frag heads for evictor.
> + *              Jesper Brouer   :       SMP/NUMA scalability
>   */
>
>  #define pr_fmt(fmt) "IPv4: " fmt
> @@ -212,7 +213,7 @@ static void ip_evictor(struct net *net)
>  {
>          int evicted;
>
> -        evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
> +        evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false, -1);
>          if (evicted)
>                  IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
>  }
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index c674f15..37291f9 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -569,7 +569,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
>          fhdr = (struct frag_hdr *)skb_transport_header(clone);
>
>          local_bh_disable();
> -        inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
> +        inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false, -1);
>          local_bh_enable();
>
>          fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index bab2c27..d1e70dd 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -529,7 +529,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>                  return 1;
>          }
>
> -        evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
> +        evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false, -1);
>          if (evicted)
>                  IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
>                                   IPSTATS_MIB_REASMFAILS, evicted);
>
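
Below is the standalone sketch mentioned above.  It only illustrates
the accounting idea; it is NOT kernel code and NOT part of the patch:
one counter per "CPU", each on its own cache line, a fast-path limit
check that reads only the local counter (like frag_mem_limit() above),
and a slow exact sum (like sum_frag_mem_limit()).  All names and the
threshold value are made up for the example.

  /* Per-CPU fragment memory accounting sketch (userspace, C11).
   * Build: gcc -std=c11 -O2 sketch.c
   */
  #include <stdatomic.h>
  #include <stdio.h>

  #define NR_CPUS      4              /* stand-in for the real CPU count */
  #define HIGH_THRESH  (256 * 1024)   /* per-CPU limit, in bytes */

  /* One counter per CPU; 64-byte alignment gives each counter its own
   * cache line, so writers on different CPUs never bounce a line. */
  struct frag_cpu_limit {
          _Alignas(64) atomic_int mem;   /* bytes accounted on this CPU */
  };

  static struct frag_cpu_limit limits[NR_CPUS];

  static void add_mem(int cpu, int bytes)
  {
          atomic_fetch_add(&limits[cpu].mem, bytes);
  }

  static void sub_mem(int cpu, int bytes)
  {
          atomic_fetch_sub(&limits[cpu].mem, bytes);
  }

  /* Fast path: only the local CPU's counter is consulted. */
  static int over_limit(int cpu)
  {
          return atomic_load(&limits[cpu].mem) > HIGH_THRESH;
  }

  /* Slow path: exact total across all CPUs, for reporting only. */
  static long sum_mem(void)
  {
          long sum = 0;

          for (int cpu = 0; cpu < NR_CPUS; cpu++)
                  sum += atomic_load(&limits[cpu].mem);
          return sum;
  }

  int main(void)
  {
          add_mem(0, 2944);   /* roughly the truesize of one fragment */
          add_mem(1, 2944);
          sub_mem(0, 2944);
          printf("cpu1 over limit: %d, total: %ld bytes\n",
                 over_limit(1), sum_mem());
          return 0;
  }

The trade-off is the same as in the patch: the fast path never touches
a remote cache line, but because high_thresh is checked per CPU, total
memory use can grow to roughly NR_CPUS * high_thresh before eviction
kicks in.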