From: Jesper Dangaard Brouer <brouer@redhat.com>
To: Eric Dumazet <eric.dumazet@gmail.com>,
	Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: netdev@vger.kernel.org, yoshfuji@linux-ipv6.org
Subject: RFC crap-patch [PATCH] net: Per CPU separate frag mem accounting
Date: Thu, 14 Mar 2013 08:25:55 +0100	[thread overview]
Message-ID: <1363245955.14913.21.camel@localhost> (raw)
In-Reply-To: <20130308221744.5312.14924.stgit@dragon>


This is NOT the patch I just mentioned in the other thread, the one removing
the LRU list.  This patch does real per-CPU mem accounting, and an LRU list
per CPU.
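
The core idea, condensed from the patch below (a simplified, compile-untested
sketch; it uses the names from the patch and assumes the struct netns_frags /
inet_frag_queue changes the patch makes, with the LRU handling and the
evictor left out):

/* Each CPU gets its own mem counter (and LRU list) on its own cacheline,
 * so the hot accounting path never bounces a cacheline between CPUs.
 */
struct frag_cpu_limit {
	atomic_t		mem;       /* bytes accounted on this CPU */
	struct list_head	lru_list;  /* per-CPU LRU of frag queues */
	spinlock_t		lru_lock;
} ____cacheline_aligned_in_smp;

/* Fast path: the queue remembers which CPU allocated it (q->cpu_alloc)
 * and only ever touches that CPU's counter.
 */
static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
{
	struct frag_cpu_limit *percpu =
		per_cpu_ptr(q->net->percpu, q->cpu_alloc);

	atomic_add(i, &percpu->mem);
}

/* Slow path (reporting only): sum the counters over all possible CPUs. */
static inline int sum_frag_mem_limit(struct netns_frags *nf)
{
	unsigned int sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += atomic_read(&per_cpu_ptr(nf->percpu, cpu)->mem);

	return sum;
}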

I get really good performance numbers with this patch, but I still think
this might not be the correct solution.

My current best results (which got applied recently), compared to this
patch:
 - Test-type:  Test-20G64K    Test-20G3F  20G64K+DoS   20G3F+DoS
 - Patch-06:   18486.7 Mbit/s  10723.20     3657.85     4560.64 Mbit/s
 - curr-best:  19041.0 Mbit/s  12105.20    10160.40    11179.30 Mbit/s

Thus, I have almost solved the DoS effect for Test-20G3F: 12 Gbit/s -> 11 Gbit/s
under DoS. The 64K+DoS case is not perfect yet: 19 Gbit/s -> 11 Gbit/s.

--Jesper


On Fri, 2013-03-08 at 23:17 +0100, Jesper Dangaard Brouer wrote:
> ... testing if the percpu_counter does not scale in DoS situations
> ---
> 
>  include/net/inet_frag.h                 |   99 ++++++++++++++++++-------------
>  net/ipv4/inet_fragment.c                |   61 +++++++++++++++----
>  net/ipv4/ip_fragment.c                  |    3 +
>  net/ipv6/netfilter/nf_conntrack_reasm.c |    2 -
>  net/ipv6/reassembly.c                   |    2 -
>  5 files changed, 110 insertions(+), 57 deletions(-)
> 
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index f2b46a5..974434a 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -1,22 +1,31 @@
>  #ifndef __NET_FRAG_H__
>  #define __NET_FRAG_H__
>  
> -#include <linux/percpu_counter.h>
> +//#include <linux/percpu_counter.h>
> +#include <linux/spinlock.h>
> +#include <linux/atomic.h>
> +#include <linux/percpu.h>
>  
> -struct netns_frags {
> -	int			nqueues;
> -	struct list_head	lru_list;
> -	spinlock_t		lru_lock;
> +/* Need to maintain these resource limits per CPU, else we will kill
> + * performance due to cache-line bouncing
> + */
> +struct frag_cpu_limit {
> +	atomic_t                mem;
> +	struct list_head        lru_list;
> +	spinlock_t              lru_lock;
> +} ____cacheline_aligned_in_smp;
>  
> -	/* The percpu_counter "mem" need to be cacheline aligned.
> -	 *  mem.count must not share cacheline with other writers
> -	 */
> -	struct percpu_counter   mem ____cacheline_aligned_in_smp;
> +struct netns_frags {
>  
>  	/* sysctls */
>  	int			timeout;
>  	int			high_thresh;
>  	int			low_thresh;
> +
> +	struct frag_cpu_limit __percpu *percpu;
> +
> +	// TODO move "nqueues" elsewhere...
> +	int			nqueues ____cacheline_aligned_in_smp;
>  };
>  
>  struct inet_frag_queue {
> @@ -25,6 +34,7 @@ struct inet_frag_queue {
>  	struct list_head	lru_list;   /* lru list member */
>  	struct hlist_node	list;
>  	atomic_t		refcnt;
> +	u32			cpu_alloc;  /* used for mem limit accounting */
>  	struct sk_buff		*fragments; /* list of received fragments */
>  	struct sk_buff		*fragments_tail;
>  	ktime_t			stamp;
> @@ -80,7 +90,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
>  void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
>  void inet_frag_destroy(struct inet_frag_queue *q,
>  				struct inet_frags *f, int *work);
> -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
> +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
> +		      bool force, int on_cpu);
>  struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
>  		struct inet_frags *f, void *key, unsigned int hash)
>  	__releases(&f->lock);
> @@ -93,59 +104,65 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
>  
>  /* Memory Tracking Functions. */
>  
> -/* The default percpu_counter batch size is not big enough to scale to
> - * fragmentation mem acct sizes.
> - * The mem size of a 64K fragment is approx:
> - *  (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
> - */
> -static unsigned int frag_percpu_counter_batch = 130000;
> -
> -static inline int frag_mem_limit(struct netns_frags *nf)
> -{
> -	return percpu_counter_read(&nf->mem);
> -}
> -
>  static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
>  {
> -	__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
> +	int cpu = q->cpu_alloc;
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +	atomic_sub(i, &percpu->mem);
>  }
>  
>  static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
>  {
> -	__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
> -}
> -
> -static inline void init_frag_mem_limit(struct netns_frags *nf)
> -{
> -	percpu_counter_init(&nf->mem, 0);
> +	int cpu = q->cpu_alloc;
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +	atomic_add(i, &percpu->mem);
>  }
>  
>  static inline int sum_frag_mem_limit(struct netns_frags *nf)
>  {
> -	return percpu_counter_sum_positive(&nf->mem);
> +	unsigned int sum = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> +		sum += atomic_read(&percpu->mem);
> +	}
> +	return sum;
>  }
>  
> +/* LRU (Least Recently Used) resource functions */
> +
>  static inline void inet_frag_lru_move(struct inet_frag_queue *q)
>  {
> -	spin_lock(&q->net->lru_lock);
> -	list_move_tail(&q->lru_list, &q->net->lru_list);
> -	spin_unlock(&q->net->lru_lock);
> +	int cpu = q->cpu_alloc;
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +
> +	spin_lock(&percpu->lru_lock);
> +	list_move_tail(&q->lru_list, &percpu->lru_list);
> +	spin_unlock(&percpu->lru_lock);
>  }
>  
>  static inline void inet_frag_lru_del(struct inet_frag_queue *q)
>  {
> -	spin_lock(&q->net->lru_lock);
> -	list_del(&q->lru_list);
> -	q->net->nqueues--;
> -	spin_unlock(&q->net->lru_lock);
> +	int cpu = q->cpu_alloc;
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +
> +	spin_lock(&percpu->lru_lock);
> +	list_del(&q->lru_list);
> +	q->net->nqueues--; //FIXME
> +	spin_unlock(&percpu->lru_lock);
>  }
>  
>  static inline void inet_frag_lru_add(struct netns_frags *nf,
>  				     struct inet_frag_queue *q)
>  {
> -	spin_lock(&nf->lru_lock);
> -	list_add_tail(&q->lru_list, &nf->lru_list);
> -	q->net->nqueues++;
> -	spin_unlock(&nf->lru_lock);
> +	int cpu = q->cpu_alloc;
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> +	spin_lock(&percpu->lru_lock);
> +	list_add_tail(&q->lru_list, &percpu->lru_list);
> +	q->net->nqueues++; //FIXME
> +	spin_unlock(&percpu->lru_lock);
>  }
>  #endif
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index e5c426f..f09fa7e 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -23,6 +23,18 @@
>  
>  #include <net/inet_frag.h>
>  
> +static inline int frag_mem_limit_on_cpu(struct netns_frags *nf, int on_cpu)
> +{
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, on_cpu);
> +	return atomic_read(&percpu->mem);
> +}
> +
> +static inline int frag_mem_limit(struct netns_frags *nf)
> +{
> +	int cpu = smp_processor_id();
> +	return frag_mem_limit_on_cpu(nf, cpu);
> +}
> +
>  static void inet_frag_secret_rebuild(unsigned long dummy)
>  {
>  	struct inet_frags *f = (struct inet_frags *)dummy;
> @@ -81,12 +93,28 @@ void inet_frags_init(struct inet_frags *f)
>  }
>  EXPORT_SYMBOL(inet_frags_init);
>  
> +static int inet_frags_init_percpu_limit(struct netns_frags *nf)
> +{
> +	int cpu;
> +
> +	nf->percpu = alloc_percpu(struct frag_cpu_limit);
> +	if (!nf->percpu)
> +		return -ENOMEM;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> +		INIT_LIST_HEAD(&percpu->lru_list);
> +		spin_lock_init(&percpu->lru_lock);
> +		atomic_set(&percpu->mem, 0);
> +	}
> +	return 0;
> +}
> +
>  void inet_frags_init_net(struct netns_frags *nf)
>  {
>  	nf->nqueues = 0; //remove?
> -	init_frag_mem_limit(nf);
> -	INIT_LIST_HEAD(&nf->lru_list);
> -	spin_lock_init(&nf->lru_lock);
> +	inet_frags_init_percpu_limit(nf);
>  }
>  EXPORT_SYMBOL(inet_frags_init_net);
>  
> @@ -98,13 +126,16 @@ EXPORT_SYMBOL(inet_frags_fini);
>  
>  void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
>  {
> +	int cpu;
> +
>  	nf->low_thresh = 0;
>  
>  	local_bh_disable();
> -	inet_frag_evictor(nf, f, true);
> +	for_each_possible_cpu(cpu)
> +		inet_frag_evictor(nf, f, true, cpu);
>  	local_bh_enable();
>  
> -	percpu_counter_destroy(&nf->mem);
> +	free_percpu(nf->percpu);
>  }
>  EXPORT_SYMBOL(inet_frags_exit_net);
>  
> @@ -179,33 +210,36 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
>  }
>  EXPORT_SYMBOL(inet_frag_destroy);
>  
> -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
> +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
> +		      bool force, int on_cpu)
>  {
>  	struct inet_frag_queue *q;
>  	int work, evicted = 0;
> +	int cpu = (likely(on_cpu < 0)) ? smp_processor_id() : on_cpu;
> +	struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
>  
>  	if (!force) {
> -		if (frag_mem_limit(nf) <= nf->high_thresh)
> +		if (frag_mem_limit_on_cpu(nf, cpu) <= nf->high_thresh)
>  			return 0;
>  	}
>  
> -	work = frag_mem_limit(nf) - nf->low_thresh;
> +	work = frag_mem_limit_on_cpu(nf, cpu) - nf->low_thresh;
>  	while (work > 0) {
> -		spin_lock(&nf->lru_lock);
> +		spin_lock(&percpu->lru_lock);
>  
> -		if (list_empty(&nf->lru_list)) {
> -			spin_unlock(&nf->lru_lock);
> +		if (list_empty(&percpu->lru_list)) {
> +			spin_unlock(&percpu->lru_lock);
>  			break;
>  		}
>  
> -		q = list_first_entry(&nf->lru_list,
> +		q = list_first_entry(&percpu->lru_list,
>  				struct inet_frag_queue, lru_list);
>  		atomic_inc(&q->refcnt);
>  
>  		// TEST: remove q from list to avoid more CPUs grabbing it
>  		list_del_init(&q->lru_list);
>  
> -		spin_unlock(&nf->lru_lock);
> +		spin_unlock(&percpu->lru_lock);
>  
>  		spin_lock(&q->lock);
>  		if (!(q->last_in & INET_FRAG_COMPLETE))
> @@ -283,6 +317,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>  		return NULL;
>  
>  	q->net = nf;
> +	q->cpu_alloc = (u32) smp_processor_id();
>  	f->constructor(q, arg);
>  	add_frag_mem_limit(q, f->qsize);
>  
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index 1211613..4241417 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -18,6 +18,7 @@
>   *		John McDonald	:	0 length frag bug.
>   *		Alexey Kuznetsov:	SMP races, threading, cleanup.
>   *		Patrick McHardy :	LRU queue of frag heads for evictor.
> + *		Jesper Brouer   :	SMP/NUMA scalability
>   */
>  
>  #define pr_fmt(fmt) "IPv4: " fmt
> @@ -212,7 +213,7 @@ static void ip_evictor(struct net *net)
>  {
>  	int evicted;
>  
> -	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
> +	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false, -1);
>  	if (evicted)
>  		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
>  }
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index c674f15..37291f9 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -569,7 +569,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
>  	fhdr = (struct frag_hdr *)skb_transport_header(clone);
>  
>  	local_bh_disable();
> -	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
> +	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false, -1);
>  	local_bh_enable();
>  
>  	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index bab2c27..d1e70dd 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -529,7 +529,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  		return 1;
>  	}
>  
> -	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
> +	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false, -1);
>  	if (evicted)
>  		IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
>  				 IPSTATS_MIB_REASMFAILS, evicted);
> 
