* [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
@ 2021-11-21 17:05 Florian Westphal
  2021-11-23 13:24 ` Pablo Neira Ayuso
  0 siblings, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2021-11-21 17:05 UTC (permalink / raw)
  To: netfilter-devel
  Cc: Florian Westphal, Karel Rericha, Shmulik Ladkani, Eyal Birger

as of commit 4608fdfc07e1
("netfilter: conntrack: collect all entries in one cycle")
conntrack gc was changed to run periodically every 2 minutes.

On systems where conntrack hash table is set to large value,
almost all evictions happen from gc worker rather than the packet
path due to hash table distribution.

This causes netlink event overflows when the events are collected.
This change exposes two sysctls:

1. gc interval (milliseconds, default: 2 minutes)
2. buckets per cycle (default: UINT_MAX / all)

This allows increasing the scan interval but also reducing burstiness
by switching to partial scans of the table for each cycle.

If the scan is changed to partial mode, the next cycle resumes at the next bucket.

The defaults keep current behaviour.

Reported-by: Karel Rericha <karel@maxtel.cz>
Cc: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 This is an alternative to Eyal's patch.
 gc_interval is in milliseconds rather than seconds, and the
 new gc_buckets can be used to switch the gc behaviour to
 a partial scan.

 For example you could configure it to scan at most 100
 buckets every 10ms, which would scan about 10k entries/s.
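
 As a rough usage sketch (assuming this patch is applied; the sysctl
 file names follow from the entries it registers under net/netfilter,
 while the helper program itself is purely hypothetical):

	/* set gc to scan at most 100 buckets every 10 ms */
	#include <stdio.h>

	static int write_sysctl(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		/* needs privileges to write to /proc/sys */
		if (write_sysctl("/proc/sys/net/netfilter/nf_conntrack_gc_interval", "10") ||
		    write_sysctl("/proc/sys/net/netfilter/nf_conntrack_gc_buckets", "100"))
			return 1;
		return 0;
	}

 100 buckets per 10 ms is 10,000 buckets/s, i.e. roughly the 10k
 entries/s mentioned above when the table holds about one entry per
 bucket.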

 If you think the extra complexity of gc_buckets is unwanted
 I would suggest that Eyal submits a v3 with gc_interval in ms
 units.

 .../networking/nf_conntrack-sysctl.rst        | 13 ++++++++++
 include/net/netfilter/nf_conntrack.h          |  2 ++
 net/netfilter/nf_conntrack_core.c             | 25 ++++++++++++++-----
 net/netfilter/nf_conntrack_standalone.c       | 24 ++++++++++++++++++
 4 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 311128abb768..26767a495406 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -61,6 +61,19 @@ nf_conntrack_frag6_timeout - INTEGER (seconds)
 
 	Time to keep an IPv6 fragment in memory.
 
+nf_conntrack_gc_buckets - INTEGER
+        default 4294967295
+
+	Number of buckets to scan during one gc cycle.
+        If the value is less than nf_conntrack_buckets, gc will return
+        early and next cycle resumes at the next unscanned bucket.
+        Default is to scan entire table per cycle.
+
+nf_conntrack_gc_interval - INTEGER (milliseconds)
+        default 120000 (2 minutes)
+
+        Garbage collector Interval (in milliseconds).
+
 nf_conntrack_generic_timeout - INTEGER (seconds)
 	default 600
 
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index cc663c68ddc4..ebaf36917c36 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -313,6 +313,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
+extern unsigned long nf_conntrack_gc_interval;
+extern unsigned int nf_conntrack_gc_buckets;
 extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 054ee9d25efe..0c789ee65e71 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 struct conntrack_gc_work {
 	struct delayed_work	dwork;
 	u32			next_bucket;
+	u32			buckets;
 	bool			exiting;
 	bool			early_drop;
 };
@@ -83,6 +84,9 @@ static DEFINE_MUTEX(nf_conntrack_mutex);
 #define MIN_CHAINLEN	8u
 #define MAX_CHAINLEN	(32u - MIN_CHAINLEN)
 
+unsigned long __read_mostly nf_conntrack_gc_interval = GC_SCAN_INTERVAL;
+unsigned int __read_mostly nf_conntrack_gc_buckets = UINT_MAX;
+
 static struct conntrack_gc_work conntrack_gc_work;
 
 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
@@ -1421,12 +1425,17 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 static void gc_worker(struct work_struct *work)
 {
 	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
+	unsigned long next_run = nf_conntrack_gc_interval;
 	unsigned int i, hashsz, nf_conntrack_max95 = 0;
-	unsigned long next_run = GC_SCAN_INTERVAL;
 	struct conntrack_gc_work *gc_work;
+	unsigned int buckets;
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
+	buckets = gc_work->buckets;
+	gc_work->buckets = 0;
+
 	i = gc_work->next_bucket;
+	gc_work->next_bucket = 0;
 	if (gc_work->early_drop)
 		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
@@ -1491,7 +1500,12 @@ static void gc_worker(struct work_struct *work)
 		cond_resched();
 		i++;
 
+		if (++buckets >= nf_conntrack_gc_buckets) {
+			gc_work->next_bucket = i;
+			break;
+		}
 		if (time_after(jiffies, end_time) && i < hashsz) {
+			gc_work->buckets = buckets;
 			gc_work->next_bucket = i;
 			next_run = 0;
 			break;
@@ -1508,16 +1522,15 @@ static void gc_worker(struct work_struct *work)
 	 * This worker is only here to reap expired entries when system went
 	 * idle after a busy period.
 	 */
-	if (next_run) {
+	if (next_run)
 		gc_work->early_drop = false;
-		gc_work->next_bucket = 0;
-	}
+
 	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
-	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
+	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
 	gc_work->exiting = false;
 }
 
@@ -2743,7 +2756,7 @@ int nf_conntrack_init_start(void)
 		goto err_proto;
 
 	conntrack_gc_work_init(&conntrack_gc_work);
-	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
+	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, 10 * HZ);
 
 	return 0;
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 80f675d884b2..38c9d0a3c898 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -554,6 +554,8 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_MAX,
 	NF_SYSCTL_CT_COUNT,
 	NF_SYSCTL_CT_BUCKETS,
+	NF_SYSCTL_CT_GC_BUCKETS,
+	NF_SYSCTL_CT_GC_INTERVAL,
 	NF_SYSCTL_CT_CHECKSUM,
 	NF_SYSCTL_CT_LOG_INVALID,
 	NF_SYSCTL_CT_EXPECT_MAX,
@@ -624,6 +626,9 @@ enum nf_ct_sysctl_index {
 
 #define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1)
 
+static const unsigned long max_scan_interval = 1 * 24 * 60 * 60 * HZ;
+static const unsigned long min_scan_interval = 1;
+
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
@@ -645,6 +650,23 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.mode           = 0644,
 		.proc_handler   = nf_conntrack_hash_sysctl,
 	},
+	[NF_SYSCTL_CT_GC_BUCKETS] = {
+		.procname       = "nf_conntrack_gc_buckets",
+		.data           = &nf_conntrack_gc_buckets,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ONE,
+	},
+	[NF_SYSCTL_CT_GC_INTERVAL] = {
+		.procname       = "nf_conntrack_gc_interval",
+		.data           = &nf_conntrack_gc_interval,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&min_scan_interval,
+		.extra2		= (void *)&max_scan_interval,
+	},
 	[NF_SYSCTL_CT_CHECKSUM] = {
 		.procname	= "nf_conntrack_checksum",
 		.data		= &init_net.ct.sysctl_checksum,
@@ -1123,6 +1145,8 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+		table[NF_SYSCTL_CT_GC_BUCKETS].mode = 0444;
+		table[NF_SYSCTL_CT_GC_INTERVAL].mode = 0444;
 	}
 
 	cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table);
-- 
2.32.0



* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-11-21 17:05 [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior Florian Westphal
@ 2021-11-23 13:24 ` Pablo Neira Ayuso
  2021-11-23 13:30   ` Florian Westphal
  2021-11-23 14:01   ` Eyal Birger
  0 siblings, 2 replies; 9+ messages in thread
From: Pablo Neira Ayuso @ 2021-11-23 13:24 UTC (permalink / raw)
  To: Florian Westphal
  Cc: netfilter-devel, Karel Rericha, Shmulik Ladkani, Eyal Birger

Hi,

On Sun, Nov 21, 2021 at 06:05:14PM +0100, Florian Westphal wrote:
> as of commit 4608fdfc07e1
> ("netfilter: conntrack: collect all entries in one cycle")
> conntrack gc was changed to run periodically every 2 minutes.
> 
> On systems where conntrack hash table is set to large value,
> almost all evictions happen from gc worker rather than the packet
> path due to hash table distribution.
> 
> This causes netlink event overflows when the events are collected.

If the issue is netlink, it should be possible to batch netlink
events.

> This change exposes two sysctls:
> 
> 1. gc interval (milliseconds, default: 2 minutes)
> 2. buckets per cycle (default: UINT_MAX / all)
> 
> This allows increasing the scan interval but also reducing burstiness
> by switching to partial scans of the table for each cycle.

Is there a way to apply autotuning? I know, this question might be
hard, but when does the user have to update this new toggle? And do we
know what value should be placed here?

@Eyal: What gc interval did you select for your setup to address this
issue? You mentioned a lot of UDP short-lived flows, correct?

> If the scan is changed to partial mode, the next cycle resumes at the next bucket.
> 
> The defaults keep current behaviour.
> 
> Reported-by: Karel Rericha <karel@maxtel.cz>
> Cc: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> Cc: Eyal Birger <eyal.birger@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  This is an alternative to Eyal's patch.
>  gc_interval is in milliseconds rather than seconds, and the
>  new gc_buckets can be used to switch the gc behaviour to
>  a partial scan.
> 
>  For example you could configure it to scan at most 100
>  buckets every 10ms, which would scan about 10k entries/s.
> 
>  If you think the extra complexity of gc_buckets is unwanted
>  I would suggest that Eyal submits a v3 with gc_interval in ms
>  units.
> 
>  .../networking/nf_conntrack-sysctl.rst        | 13 ++++++++++
>  include/net/netfilter/nf_conntrack.h          |  2 ++
>  net/netfilter/nf_conntrack_core.c             | 25 ++++++++++++++-----
>  net/netfilter/nf_conntrack_standalone.c       | 24 ++++++++++++++++++
>  4 files changed, 58 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
> index 311128abb768..26767a495406 100644
> --- a/Documentation/networking/nf_conntrack-sysctl.rst
> +++ b/Documentation/networking/nf_conntrack-sysctl.rst
> @@ -61,6 +61,19 @@ nf_conntrack_frag6_timeout - INTEGER (seconds)
>  
>  	Time to keep an IPv6 fragment in memory.
>  
> +nf_conntrack_gc_buckets - INTEGER
> +        default 4294967295
> +
> +	Number of buckets to scan during one gc cycle.
> +        If the value is less than nf_conntrack_buckets, gc will return
> +        early and next cycle resumes at the next unscanned bucket.
> +        Default is to scan entire table per cycle.
> +
> +nf_conntrack_gc_interval - INTEGER (milliseconds)
> +        default 120000 (2 minutes)
> +
> +        Garbage collector Interval (in milliseconds).
> +
>  nf_conntrack_generic_timeout - INTEGER (seconds)
>  	default 600
>  
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index cc663c68ddc4..ebaf36917c36 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -313,6 +313,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
>  extern struct hlist_nulls_head *nf_conntrack_hash;
>  extern unsigned int nf_conntrack_htable_size;
>  extern seqcount_spinlock_t nf_conntrack_generation;
> +extern unsigned long nf_conntrack_gc_interval;
> +extern unsigned int nf_conntrack_gc_buckets;
>  extern unsigned int nf_conntrack_max;
>  
>  /* must be called with rcu read lock held */
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 054ee9d25efe..0c789ee65e71 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
>  struct conntrack_gc_work {
>  	struct delayed_work	dwork;
>  	u32			next_bucket;
> +	u32			buckets;
>  	bool			exiting;
>  	bool			early_drop;
>  };
> @@ -83,6 +84,9 @@ static DEFINE_MUTEX(nf_conntrack_mutex);
>  #define MIN_CHAINLEN	8u
>  #define MAX_CHAINLEN	(32u - MIN_CHAINLEN)
>  
> +unsigned long __read_mostly nf_conntrack_gc_interval = GC_SCAN_INTERVAL;
> +unsigned int __read_mostly nf_conntrack_gc_buckets = UINT_MAX;
> +
>  static struct conntrack_gc_work conntrack_gc_work;
>  
>  void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
> @@ -1421,12 +1425,17 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
>  static void gc_worker(struct work_struct *work)
>  {
>  	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
> +	unsigned long next_run = nf_conntrack_gc_interval;
>  	unsigned int i, hashsz, nf_conntrack_max95 = 0;
> -	unsigned long next_run = GC_SCAN_INTERVAL;
>  	struct conntrack_gc_work *gc_work;
> +	unsigned int buckets;
>  	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
>  
> +	buckets = gc_work->buckets;
> +	gc_work->buckets = 0;
> +
>  	i = gc_work->next_bucket;
> +	gc_work->next_bucket = 0;
>  	if (gc_work->early_drop)
>  		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
>  
> @@ -1491,7 +1500,12 @@ static void gc_worker(struct work_struct *work)
>  		cond_resched();
>  		i++;
>  
> +		if (++buckets >= nf_conntrack_gc_buckets) {
> +			gc_work->next_bucket = i;
> +			break;
> +		}
>  		if (time_after(jiffies, end_time) && i < hashsz) {
> +			gc_work->buckets = buckets;
>  			gc_work->next_bucket = i;
>  			next_run = 0;
>  			break;
> @@ -1508,16 +1522,15 @@ static void gc_worker(struct work_struct *work)
>  	 * This worker is only here to reap expired entries when system went
>  	 * idle after a busy period.
>  	 */
> -	if (next_run) {
> +	if (next_run)
>  		gc_work->early_drop = false;
> -		gc_work->next_bucket = 0;
> -	}
> +
>  	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
>  }
>  
>  static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
>  {
> -	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
> +	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
>  	gc_work->exiting = false;
>  }
>  
> @@ -2743,7 +2756,7 @@ int nf_conntrack_init_start(void)
>  		goto err_proto;
>  
>  	conntrack_gc_work_init(&conntrack_gc_work);
> -	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
> +	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, 10 * HZ);
>  
>  	return 0;
>  
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 80f675d884b2..38c9d0a3c898 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -554,6 +554,8 @@ enum nf_ct_sysctl_index {
>  	NF_SYSCTL_CT_MAX,
>  	NF_SYSCTL_CT_COUNT,
>  	NF_SYSCTL_CT_BUCKETS,
> +	NF_SYSCTL_CT_GC_BUCKETS,
> +	NF_SYSCTL_CT_GC_INTERVAL,
>  	NF_SYSCTL_CT_CHECKSUM,
>  	NF_SYSCTL_CT_LOG_INVALID,
>  	NF_SYSCTL_CT_EXPECT_MAX,
> @@ -624,6 +626,9 @@ enum nf_ct_sysctl_index {
>  
>  #define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1)
>  
> +static const unsigned long max_scan_interval = 1 * 24 * 60 * 60 * HZ;
> +static const unsigned long min_scan_interval = 1;
> +
>  static struct ctl_table nf_ct_sysctl_table[] = {
>  	[NF_SYSCTL_CT_MAX] = {
>  		.procname	= "nf_conntrack_max",
> @@ -645,6 +650,23 @@ static struct ctl_table nf_ct_sysctl_table[] = {
>  		.mode           = 0644,
>  		.proc_handler   = nf_conntrack_hash_sysctl,
>  	},
> +	[NF_SYSCTL_CT_GC_BUCKETS] = {
> +		.procname       = "nf_conntrack_gc_buckets",
> +		.data           = &nf_conntrack_gc_buckets,
> +		.maxlen         = sizeof(unsigned int),
> +		.mode           = 0644,
> +		.proc_handler	= proc_douintvec_minmax,
> +		.extra1		= SYSCTL_ONE,
> +	},
> +	[NF_SYSCTL_CT_GC_INTERVAL] = {
> +		.procname       = "nf_conntrack_gc_interval",
> +		.data           = &nf_conntrack_gc_interval,
> +		.maxlen         = sizeof(unsigned long),
> +		.mode           = 0644,
> +		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
> +		.extra1		= (void *)&min_scan_interval,
> +		.extra2		= (void *)&max_scan_interval,
> +	},
>  	[NF_SYSCTL_CT_CHECKSUM] = {
>  		.procname	= "nf_conntrack_checksum",
>  		.data		= &init_net.ct.sysctl_checksum,
> @@ -1123,6 +1145,8 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>  		table[NF_SYSCTL_CT_MAX].mode = 0444;
>  		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
>  		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
> +		table[NF_SYSCTL_CT_GC_BUCKETS].mode = 0444;
> +		table[NF_SYSCTL_CT_GC_INTERVAL].mode = 0444;
>  	}
>  
>  	cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table);
> -- 
> 2.32.0
> 


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-11-23 13:24 ` Pablo Neira Ayuso
@ 2021-11-23 13:30   ` Florian Westphal
  2021-11-30 21:31     ` Pablo Neira Ayuso
  2021-11-23 14:01   ` Eyal Birger
  1 sibling, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2021-11-23 13:30 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, netfilter-devel, Karel Rericha,
	Shmulik Ladkani, Eyal Birger

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> Hi,
> 
> On Sun, Nov 21, 2021 at 06:05:14PM +0100, Florian Westphal wrote:
> > as of commit 4608fdfc07e1
> > ("netfilter: conntrack: collect all entries in one cycle")
> > conntrack gc was changed to run periodically every 2 minutes.
> > 
> > On systems where conntrack hash table is set to large value,
> > almost all evictions happen from gc worker rather than the packet
> > path due to hash table distribution.
> > 
> > This causes netlink event overflows when the events are collected.
> 
> If the issue is netlink, it should be possible to batch netlink
> events.

I do not see how.

> > 1. gc interval (milliseconds, default: 2 minutes)
> > 2. buckets per cycle (default: UINT_MAX / all)
> > 
> > This allows increasing the scan interval but also reducing burstiness
> > by switching to partial scans of the table for each cycle.
> 
> Is there a way to apply autotuning? I know, this question might be
> hard, but when does the user have to update this new toggle?

Whenever you need timely delivery of events, or you need timely
reaping of outdated entries.

And we can't increase scan frequency because that will cause
more wakeups on otherwise idle systems, that was the entire reason
for going with 2m.

> And do we
> know what value should be placed here?

I tried, it did not work out (see the history of the gc worker).

The only alternative I see is to give up and revert back to
per-ct timers.


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-11-23 13:24 ` Pablo Neira Ayuso
  2021-11-23 13:30   ` Florian Westphal
@ 2021-11-23 14:01   ` Eyal Birger
  2021-11-24  9:17     ` Karel Rericha
  1 sibling, 1 reply; 9+ messages in thread
From: Eyal Birger @ 2021-11-23 14:01 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, netfilter-devel, Karel Rericha, Shmulik Ladkani

Hi Pablo,

On Tue, Nov 23, 2021 at 3:24 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> Hi,
>
> On Sun, Nov 21, 2021 at 06:05:14PM +0100, Florian Westphal wrote:
> > as of commit 4608fdfc07e1
> > ("netfilter: conntrack: collect all entries in one cycle")
> > conntrack gc was changed to run periodically every 2 minutes.
> >
> > On systems where conntrack hash table is set to large value,
> > almost all evictions happen from gc worker rather than the packet
> > path due to hash table distribution.
> >
> > This causes netlink event overflows when the events are collected.
>
> If the issue is netlink, it should be possible to batch netlink
> events.
>
> > This change exposes two sysctls:
> >
> > 1. gc interval (milliseconds, default: 2 minutes)
> > 2. buckets per cycle (default: UINT_MAX / all)
> >
> > This allows increasing the scan interval but also reducing burstiness
> > by switching to partial scans of the table for each cycle.
>
> Is there a way to apply autotuning? I know, this question might be
> hard, but when does the user have to update this new toggle? And do we
> know what value should be placed here?
>
> @Eyal: What gc interval did you select for your setup to address this
> issue? You mentioned a lot of UDP short-lived flows, correct?

Yes, we have a lot of short lived UDP flows related to a DNS server service.

We collect flow termination events using ulogd and forward them as JSON
messages over UDP to fluentd. Since these flows are reaped every 2 minutes,
we see spikes in UDP rx drops due to fluentd not keeping up with the bursts.

We planned to configure this to run every 10s or so, which should be
sufficient for our workloads, and monitor these spikes in order to tune
further as needed.

Eyal.


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-11-23 14:01   ` Eyal Birger
@ 2021-11-24  9:17     ` Karel Rericha
  0 siblings, 0 replies; 9+ messages in thread
From: Karel Rericha @ 2021-11-24  9:17 UTC (permalink / raw)
  To: Eyal Birger
  Cc: Pablo Neira Ayuso, Florian Westphal, netfilter-devel, Shmulik Ladkani

If I may add myself to the discussion:

Florian's tunable solution is good enough for both worlds: a high gc
interval will keep idle VMs from waking up unnecessarily often, and a
low gc interval will prevent dropped conntrack events on systems with
busy, very large conntrack tables that acquire conntrack events
through netlink.

I'm not sure about the default value, though. Two minutes means
dropping events for some systems, i.e. breaking functionality compared
to the previous gc solution. For VMs, a few more wake-ups don't break
anything, I would guess (except for a slightly higher load). So a good
default would be going back to hundreds of milliseconds, or at least
to seconds. Two minutes causes dropped conntrack events even with a
100MB netlink socket buffer here (several thousand events per second,
conntrack max 1M, hash table size 1M).
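
To put rough numbers on that (assuming, as a ballpark only, that one
destroy event costs on the order of ~164 bytes of netlink data, and
taking 5,000 events/s as a stand-in for "several thousand"):

	5,000 events/s * 120 s ~= 600,000 expired entries per gc cycle
	600,000 * ~164 bytes   ~= 98 MB of event data in one burst

so a single 2-minute scan can emit about as much data as the whole
100MB socket buffer holds (less fits in practice, since socket buffer
accounting includes per-skb overhead), and any reader that briefly
falls behind drops events.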

Karel

On Tue, Nov 23, 2021 at 3:01 PM Eyal Birger <eyal.birger@gmail.com> wrote:
>
> Hi Pablo,
>
> On Tue, Nov 23, 2021 at 3:24 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Hi,
> >
> > On Sun, Nov 21, 2021 at 06:05:14PM +0100, Florian Westphal wrote:
> > > as of commit 4608fdfc07e1
> > > ("netfilter: conntrack: collect all entries in one cycle")
> > > conntrack gc was changed to run periodically every 2 minutes.
> > >
> > > On systems where conntrack hash table is set to large value,
> > > almost all evictions happen from gc worker rather than the packet
> > > path due to hash table distribution.
> > >
> > > This causes netlink event overflows when the events are collected.
> >
> > If the issue is netlink, it should be possible to batch netlink
> > events.
> >
> > > This change exposes two sysctls:
> > >
> > > 1. gc interval (milliseconds, default: 2 minutes)
> > > 2. buckets per cycle (default: UINT_MAX / all)
> > >
> > > This allows increasing the scan interval but also reducing burstiness
> > > by switching to partial scans of the table for each cycle.
> >
> > Is there a way to apply autotuning? I know, this question might be
> > hard, but when does the user have to update this new toggle? And do we
> > know what value should be placed here?
> >
> > @Eyal: What gc interval did you select for your setup to address this
> > issue? You mentioned a lot of UDP short-lived flows, correct?
>
> Yes, we have a lot of short lived UDP flows related to a DNS server service.
>
> We collect flow termination events using ulogd and forward them as JSON
> messages over UDP to fluentd. Since these flows are reaped every 2 minutes,
> we see spikes in UDP rx drops due to fluentd not keeping up with the bursts.
>
> We planned to configure this to run every 10s or so, which should be
> sufficient for our workloads, and monitor these spikes in order to tune
> further as needed.
>
> Eyal.


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-11-23 13:30   ` Florian Westphal
@ 2021-11-30 21:31     ` Pablo Neira Ayuso
  2021-12-01 11:24       ` Florian Westphal
  0 siblings, 1 reply; 9+ messages in thread
From: Pablo Neira Ayuso @ 2021-11-30 21:31 UTC (permalink / raw)
  To: Florian Westphal
  Cc: netfilter-devel, Karel Rericha, Shmulik Ladkani, Eyal Birger

[-- Attachment #1: Type: text/plain, Size: 2170 bytes --]

On Tue, Nov 23, 2021 at 02:30:45PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > Hi,
> > 
> > On Sun, Nov 21, 2021 at 06:05:14PM +0100, Florian Westphal wrote:
> > > as of commit 4608fdfc07e1
> > > ("netfilter: conntrack: collect all entries in one cycle")
> > > conntrack gc was changed to run periodically every 2 minutes.
> > > 
> > > On systems where conntrack hash table is set to large value,
> > > almost all evictions happen from gc worker rather than the packet
> > > path due to hash table distribution.
> > > 
> > > This causes netlink event overflows when the events are collected.
> > 
> > If the issue is netlink, it should be possible to batch netlink
> > events.
> 
> I do not see how.

I started a patchset, but the single hashtable for every netns might
defeat the batching, if there is a table per netns then it should be
similar to 67cc570edaa0.

But this would be a large patch to address this issue, so see
below.

> > > 1. gc interval (milliseconds, default: 2 minutes)
> > > 2. buckets per cycle (default: UINT_MAX / all)
> > > 
> > > This allows increasing the scan interval but also reducing burstiness
> > > by switching to partial scans of the table for each cycle.
> > 
> > Is there a way to apply autotuning? I know, this question might be
> > hard, but when does the user have to update this new toggle?
> 
> Whenever you need timely delivery of events, or you need timely
> reaping of outdated entries.
> 
> And we can't increase scan frequency because that will cause
> more wakeups on otherwise idle systems, that was the entire reason
> for going with 2m.

Default 2m is probably too large? This should be set at least to the
UDP unreplied timeout, ie. 30s?

> > And do we know what value should be placed here?
> 
> I tried, it did not work out (see the history of the gc worker).

Probably set the default scan interval to 20s and reduce it if there is
workload coming in the next scan round? It is possible to count the
number of entries that will expire in the next round; if this
represents a large % of entries, then reduce the scan interval of the
garbage collector.

I'm attaching a patch.
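
As a quick worked instance of the adjustment below (illustrative
percentages only): if 25% of the entries scanned in a cycle would
expire before the next run, idle = 100 - 25 = 75 and
next_run = 20s * 75 / 100 = 15s; if 90% would expire, next_run drops
to 2s, so busy tables get scanned (and their events delivered) in
smaller, more frequent batches.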

[-- Attachment #2: ct-gc-adapt.patch --]
[-- Type: text/x-diff, Size: 2950 bytes --]

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 770a63103c7a..3f6731a9c722 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -77,7 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all;
 /* serialize hash resizes and nf_ct_iterate_cleanup */
 static DEFINE_MUTEX(nf_conntrack_mutex);
 
-#define GC_SCAN_INTERVAL	(120u * HZ)
+/* Scan every 20 seconds which is 2/3 of the UDP unreplied timeout. */
+#define GC_SCAN_INTERVAL	(20u * HZ)
 #define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
 
 #define MIN_CHAINLEN	8u
@@ -1418,12 +1419,22 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 	return false;
 }
 
+static bool nf_ct_is_expired_next_run(const struct nf_conn *ct)
+{
+	unsigned long next_timestamp = nfct_time_stamp + GC_SCAN_INTERVAL;
+
+	return (__s32)(ct->timeout - next_timestamp) <= 0;
+}
+
 static void gc_worker(struct work_struct *work)
 {
+	unsigned long next_run_expired_entries = 0, entries = 0, idle;
 	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
 	unsigned int i, hashsz, nf_conntrack_max95 = 0;
 	unsigned long next_run = GC_SCAN_INTERVAL;
 	struct conntrack_gc_work *gc_work;
+	bool next_run_expired;
+
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
@@ -1448,6 +1459,8 @@ static void gc_worker(struct work_struct *work)
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 
+			next_run_expired = false;
+			entries++;
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
 			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
@@ -1458,6 +1471,9 @@ static void gc_worker(struct work_struct *work)
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
 				continue;
+			} else if (nf_ct_is_expired_next_run(tmp)) {
+				next_run_expired = true;
+				next_run_expired_entries++;
 			}
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
@@ -1477,8 +1493,12 @@ static void gc_worker(struct work_struct *work)
 				continue;
 			}
 
-			if (gc_worker_can_early_drop(tmp))
+			if (gc_worker_can_early_drop(tmp)) {
+				if (next_run_expired)
+					next_run_expired_entries--;
+
 				nf_ct_kill(tmp);
+			}
 
 			nf_ct_put(tmp);
 		}
@@ -1511,7 +1531,22 @@ static void gc_worker(struct work_struct *work)
 	if (next_run) {
 		gc_work->early_drop = false;
 		gc_work->next_bucket = 0;
+		/*
+		 * Calculate gc workload for the next run, adjust the gc
+		 * interval not to reap expired entries in bursts.
+		 *
+		 * Adjust scan interval linearly based on the percentage of
+		 * entries that will expire in the next run. The scan interval
+		 * is inversely proportional to the workload.
+		 */
+		if (entries == 0) {
+			next_run = GC_SCAN_INTERVAL;
+		} else {
+			idle = 100u - (next_run_expired_entries * 100u / entries);
+			next_run = GC_SCAN_INTERVAL * idle / 100u;
+		}
 	}
+
 	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-11-30 21:31     ` Pablo Neira Ayuso
@ 2021-12-01 11:24       ` Florian Westphal
  2021-12-14 10:37         ` Eyal Birger
  0 siblings, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2021-12-01 11:24 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, netfilter-devel, Karel Rericha,
	Shmulik Ladkani, Eyal Birger

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > I do not see how.
> 
> I started a patchset, but the single hashtable for every netns might
> defeat the batching, if there is a table per netns then it should be
> similar to 67cc570edaa0.

I see.

> > for going with 2m.
> 
> Default 2m is probably too large? This should be set at least to the
> UDP unreplied timeout, ie. 30s?

Perhaps but I don't think 30s is going to resolve the issue at hand
(burstiness).

> Probably set the default scan interval to 20s and reduce it if there is
> workload coming in the next scan round? It is possible to count the
> number of entries that will expire in the next round; if this
> represents a large % of entries, then reduce the scan interval of the
> garbage collector.

I don't see how this helps, see below.

> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 770a63103c7a..3f6731a9c722 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -77,7 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all;
>  /* serialize hash resizes and nf_ct_iterate_cleanup */
>  static DEFINE_MUTEX(nf_conntrack_mutex);
>  
> -#define GC_SCAN_INTERVAL	(120u * HZ)
> +/* Scan every 20 seconds which is 2/3 of the UDP unreplied timeout. */
> +#define GC_SCAN_INTERVAL	(20u * HZ)
>  #define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
>  
>  #define MIN_CHAINLEN	8u
> @@ -1418,12 +1419,22 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
>  	return false;
>  }
>  
> +static bool nf_ct_is_expired_next_run(const struct nf_conn *ct)
> +{
> +	unsigned long next_timestamp = nfct_time_stamp + GC_SCAN_INTERVAL;
> +
> +	return (__s32)(ct->timeout - next_timestamp) <= 0;
> +}

Ok.

>  static void gc_worker(struct work_struct *work)
>  {
> +	unsigned long next_run_expired_entries = 0, entries = 0, idle;
>  	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
>  	unsigned int i, hashsz, nf_conntrack_max95 = 0;
>  	unsigned long next_run = GC_SCAN_INTERVAL;
>  	struct conntrack_gc_work *gc_work;
> +	bool next_run_expired;
> +
>  	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
>  
>  	i = gc_work->next_bucket;
> @@ -1448,6 +1459,8 @@ static void gc_worker(struct work_struct *work)
>  			struct nf_conntrack_net *cnet;
>  			struct net *net;
>  
> +			next_run_expired = false;
> +			entries++;
>  			tmp = nf_ct_tuplehash_to_ctrack(h);
>  
>  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> @@ -1458,6 +1471,9 @@ static void gc_worker(struct work_struct *work)
>  			if (nf_ct_is_expired(tmp)) {
>  				nf_ct_gc_expired(tmp);
>  				continue;
> +			} else if (nf_ct_is_expired_next_run(tmp)) {
> +				next_run_expired = true;
> +				next_run_expired_entries++;

This means this expires within next 20s, but not now.

> @@ -1511,7 +1531,22 @@ static void gc_worker(struct work_struct *work)
>  	if (next_run) {
>  		gc_work->early_drop = false;
>  		gc_work->next_bucket = 0;
> +		/*
> +		 * Calculate gc workload for the next run, adjust the gc
> +		 * interval not to reap expired entries in bursts.
> +		 *
> +		 * Adjust scan interval linearly based on the percentage of
> +		 * entries that will expire in the next run. The scan interval
> +		 * is inversely proportional to the workload.
> +		 */
> +		if (entries == 0) {
> +			next_run = GC_SCAN_INTERVAL;
> +		} else {
> +			idle = 100u - (next_run_expired_entries * 100u / entries);
> +			next_run = GC_SCAN_INTERVAL * idle / 100u;

AFAICS we may now schedule next run for 'right now' even though that
would not find any expired entries (they might all have a timeout of
19s). Next round would reap no entries, then resched again immediately

(the nf_ct_is_expired_next_run expire count assumes next run is in
 20s, not before).

This would burn cycles for 19s before those entries can be expired.

Not sure how to best avoid this, perhaps computing the remaining avg timeout
of the nf_ct_is_expired_next_run() candidates would help?
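
A concrete (made-up) example of the concern, using the formula quoted
above: say the scan walks entries = 1000 and every one of them expires
in ~19s, so next_run_expired_entries = 1000.  Then

	idle     = 100 - (1000 * 100 / 1000) = 0
	next_run = GC_SCAN_INTERVAL * 0 / 100  = 0

so the worker reschedules immediately, yet nothing is reapable for
another ~19 seconds; each immediate re-run walks the table, frees
nothing and computes next_run = 0 again until the timeouts actually
elapse.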


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-12-01 11:24       ` Florian Westphal
@ 2021-12-14 10:37         ` Eyal Birger
  2022-01-11 19:44           ` Florian Westphal
  0 siblings, 1 reply; 9+ messages in thread
From: Eyal Birger @ 2021-12-14 10:37 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, netfilter-devel, Karel Rericha, Shmulik Ladkani

On Wed, Dec 1, 2021 at 1:24 PM Florian Westphal <fw@strlen.de> wrote:
>
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > I do not see how.
> >
> > I started a patchset, but the single hashtable for every netns might
> > defeat the batching, if there is a table per netns then it should be
> > similar to 67cc570edaa0.
>
> I see.
>
> > > for going with 2m.
> >
> > Default 2m is probably too large? This should be set at least to the
> > UDP unreplied timeout, ie. 30s?
>
> Perhaps but I don't think 30s is going to resolve the issue at hand
> (burstiness).
>
> > Probably set the default scan interval to 20s and reduce it if there is
> > workload coming in the next scan round? It is possible to count the
> > number of entries that will expire in the next round; if this
> > represents a large % of entries, then reduce the scan interval of the
> > garbage collector.
>
> I don't see how this helps, see below.
>
> > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > index 770a63103c7a..3f6731a9c722 100644
> > --- a/net/netfilter/nf_conntrack_core.c
> > +++ b/net/netfilter/nf_conntrack_core.c
> > @@ -77,7 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all;
> >  /* serialize hash resizes and nf_ct_iterate_cleanup */
> >  static DEFINE_MUTEX(nf_conntrack_mutex);
> >
> > -#define GC_SCAN_INTERVAL     (120u * HZ)
> > +/* Scan every 20 seconds which is 2/3 of the UDP unreplied timeout. */
> > +#define GC_SCAN_INTERVAL     (20u * HZ)
> >  #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
> >
> >  #define MIN_CHAINLEN 8u
> > @@ -1418,12 +1419,22 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
> >       return false;
> >  }
> >
> > +static bool nf_ct_is_expired_next_run(const struct nf_conn *ct)
> > +{
> > +     unsigned long next_timestamp = nfct_time_stamp + GC_SCAN_INTERVAL;
> > +
> > +     return (__s32)(ct->timeout - next_timestamp) <= 0;
> > +}
>
> Ok.
>
> >  static void gc_worker(struct work_struct *work)
> >  {
> > +     unsigned long next_run_expired_entries = 0, entries = 0, idle;
> >       unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
> >       unsigned int i, hashsz, nf_conntrack_max95 = 0;
> >       unsigned long next_run = GC_SCAN_INTERVAL;
> >       struct conntrack_gc_work *gc_work;
> > +     bool next_run_expired;
> > +
> >       gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
> >
> >       i = gc_work->next_bucket;
> > @@ -1448,6 +1459,8 @@ static void gc_worker(struct work_struct *work)
> >                       struct nf_conntrack_net *cnet;
> >                       struct net *net;
> >
> > +                     next_run_expired = false;
> > +                     entries++;
> >                       tmp = nf_ct_tuplehash_to_ctrack(h);
> >
> >                       if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > @@ -1458,6 +1471,9 @@ static void gc_worker(struct work_struct *work)
> >                       if (nf_ct_is_expired(tmp)) {
> >                               nf_ct_gc_expired(tmp);
> >                               continue;
> > +                     } else if (nf_ct_is_expired_next_run(tmp)) {
> > +                             next_run_expired = true;
> > +                             next_run_expired_entries++;
>
> This means this expires within next 20s, but not now.
>
> > @@ -1511,7 +1531,22 @@ static void gc_worker(struct work_struct *work)
> >       if (next_run) {
> >               gc_work->early_drop = false;
> >               gc_work->next_bucket = 0;
> > +             /*
> > +              * Calculate gc workload for the next run, adjust the gc
> > +              * interval not to reap expired entries in bursts.
> > +              *
> > +              * Adjust scan interval linearly based on the percentage of
> > +              * entries that will expire in the next run. The scan interval
> > +              * is inversely proportional to the workload.
> > +              */
> > +             if (entries == 0) {
> > +                     next_run = GC_SCAN_INTERVAL;
> > +             } else {
> > +                     idle = 100u - (next_run_expired_entries * 100u / entries);
> > +                     next_run = GC_SCAN_INTERVAL * idle / 100u;
>
> AFAICS we may now schedule next run for 'right now' even though that
> would not find any expired entries (they might all have a timeout of
> 19s). Next round would reap no entries, then resched again immediately
>
> (the nf_ct_is_expired_next_run expire count assumes next run is in
>  20s, not before).
>
> This would burn cycles for 19s before those entries can be expired.
>
> Not sure how to best avoid this, perhaps computing the remaining avg timeout
> of the nf_ct_is_expired_next_run() candidates would help?

At least for us - where our load is mostly constant - using an avg
seems like a good approach.


* Re: [PATCH nf-next] netfilter: conntrack: allow to tune gc behavior
  2021-12-14 10:37         ` Eyal Birger
@ 2022-01-11 19:44           ` Florian Westphal
  0 siblings, 0 replies; 9+ messages in thread
From: Florian Westphal @ 2022-01-11 19:44 UTC (permalink / raw)
  To: Eyal Birger
  Cc: Florian Westphal, Pablo Neira Ayuso, netfilter-devel,
	Karel Rericha, Shmulik Ladkani

Eyal Birger <eyal.birger@gmail.com> wrote:

Hi Eyal,

> > AFAICS we may now schedule next run for 'right now' even though that
> > would not find any expired entries (they might all have a timeout of
> > 19s). Next round would reap no entries, then resched again immediately
> >
> > (the nf_ct_is_expired_next_run expire count assumes next run is in
> >  20s, not before).
> >
> > This would burn cycles for 19s before those entries can be expired.
> >
> > Not sure how to best avoid this, perhaps computing the remaining avg timeout
> > of the nf_ct_is_expired_next_run() candidates would help?
> 
> At least for us - where our load is mostly constant - using an avg
> seems like a good approach.

I gave avg a shot; here it is.
This restarts the gc worker whenever it ran for too long, just like
before, but this time there is also an initial limit on the events/s
generated per cycle.

The next run is scheduled based on the average of the (non-expired)
timeouts.  There is an initial bias towards large values; this is to
avoid an immediate scan if there are just a few entries (users might
have lowered e.g. the udp timeout to 1 second).

For Karel's use case I fear that it's not enough and the sysctl patch
is still superior (provided the default tuneables were changed).

Karel, Eyal, it would be nice if you could have a look or test this
in a more realistic scenario than my synflood one.
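
As a rough illustration of the averaging in the patch below
(hypothetical numbers): every scanned, non-expired entry does

	next_run = (next_run + clamp(nf_ct_expires(ct))) / 2

starting from GC_SCAN_INTERVAL_INIT = INT_MAX.  That initial bias is
halved once per entry, so after n entries it only contributes about
INT_MAX / 2^n jiffies (roughly 64 jiffies after ~25 entries); with
just a handful of entries the average stays huge and the final clamp
to GC_SCAN_INTERVAL_MAX (60s) decides the next run instead of a short
per-entry timeout.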

From 0ce5d16de93e0d33435734b3f267fd591572dcec Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 8 Nov 2021 12:32:03 +0100
Subject: [RFC] netfilter: conntrack: revisit gc autotuning

Please don't apply yet; it contains a few stats+pr_debug statements
that should not be needed.

as of commit 4608fdfc07e1
("netfilter: conntrack: collect all entries in one cycle")
conntrack gc was changed to run every 2 minutes.

On systems where conntrack hash table is set to large value,
almost all evictions happen from gc worker rather than the packet
path due to hash table distribution.

This causes netlink event overflows when the events are collected.

This change collects the average expiry of the scanned entries and
reschedules to the average value, within a 1 to 60 second interval.

To avoid event overflows, add an upper threshold and a rate limit:
if more entries have to be evicted, reschedule and restart 1 jiffy
into the future.

Reported-by: Karel Rericha <karel@maxtel.cz>
Cc: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_core.c | 102 +++++++++++++++++++++++++-----
 1 file changed, 85 insertions(+), 17 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 894a325d39f2..e7778218fa7c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -67,6 +67,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 struct conntrack_gc_work {
 	struct delayed_work	dwork;
 	u32			next_bucket;
+	u32			avg_timeout;
+	u32			start_time;
+	u32			resched;
+	u32			expired;
 	bool			exiting;
 	bool			early_drop;
 };
@@ -78,9 +82,24 @@ static __read_mostly bool nf_conntrack_locks_all;
 /* serialize hash resizes and nf_ct_iterate_cleanup */
 static DEFINE_MUTEX(nf_conntrack_mutex);
 
-#define GC_SCAN_INTERVAL	(120u * HZ)
+#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)
+
+/* large initial bias so that we don't scan often just because we have
+ * three entries with a 1s timeout.
+ */
+#define GC_SCAN_INTERVAL_INIT	INT_MAX
+
 #define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
 
+#define CT_SINGLE_EVENT_SIZE	164	/* estimated size in bytes of a single event */
+
+/* initial throttling, about 8mb/s of event data */
+#define GC_SCAN_EXPIRED_MAX	(8 * 1024 * 1024 / CT_SINGLE_EVENT_SIZE / HZ)
+
 #define MIN_CHAINLEN	8u
 #define MAX_CHAINLEN	(32u - MIN_CHAINLEN)
 
@@ -1421,16 +1440,30 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
 	unsigned int i, hashsz, nf_conntrack_max95 = 0;
-	unsigned long next_run = GC_SCAN_INTERVAL;
+	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
+	unsigned int expired_count = 0;
+	unsigned long next_run;
+	s32 delta_time;
+
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
 	if (gc_work->early_drop)
 		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
+	if (i == 0) {
+		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+		gc_work->start_time = start_time;
+		gc_work->resched = 0;
+		gc_work->expired = 0;
+	}
+
+	next_run = gc_work->avg_timeout;
+
+	end_time = start_time + GC_SCAN_MAX_DURATION;
+
 	do {
 		struct nf_conntrack_tuple_hash *h;
 		struct hlist_nulls_head *ct_hash;
@@ -1447,6 +1480,7 @@ static void gc_worker(struct work_struct *work)
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
 			struct nf_conntrack_net *cnet;
+			unsigned long expires;
 			struct net *net;
 
 			tmp = nf_ct_tuplehash_to_ctrack(h);
@@ -1456,11 +1490,30 @@ static void gc_worker(struct work_struct *work)
 				continue;
 			}
 
+			if (expired_count > GC_SCAN_EXPIRED_MAX) {
+				rcu_read_unlock();
+
+				gc_work->next_bucket = i;
+				gc_work->avg_timeout = next_run;
+				gc_work->resched++;
+
+				delta_time = nfct_time_stamp - gc_work->start_time;
+
+				next_run = delta_time < GC_SCAN_INTERVAL_MIN;
+				goto early_exit;
+			}
+
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
+				expired_count++;
+				gc_work->expired++;
 				continue;
 			}
 
+			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+			next_run += expires;
+			next_run /= 2u;
+
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
@@ -1478,8 +1531,11 @@ static void gc_worker(struct work_struct *work)
 				continue;
 			}
 
-			if (gc_worker_can_early_drop(tmp))
+			if (gc_worker_can_early_drop(tmp)) {
 				nf_ct_kill(tmp);
+				expired_count++;
+				gc_work->expired++;
+			}
 
 			nf_ct_put(tmp);
 		}
@@ -1492,33 +1548,45 @@ static void gc_worker(struct work_struct *work)
 		cond_resched();
 		i++;
 
-		if (time_after(jiffies, end_time) && i < hashsz) {
+		delta_time = nfct_time_stamp - end_time;
+		if (delta_time > 0 && i < hashsz) {
+			gc_work->avg_timeout = next_run;
 			gc_work->next_bucket = i;
+			gc_work->resched++;
 			next_run = 0;
-			break;
+			goto early_exit;
 		}
 	} while (i < hashsz);
 
+	gc_work->next_bucket = 0;
+
+	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
+
+	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+	if (next_run > (unsigned long)delta_time)
+		next_run -= delta_time;
+	else
+		next_run = 1;
+
+early_exit:
 	if (gc_work->exiting)
 		return;
 
-	/*
-	 * Eviction will normally happen from the packet path, and not
-	 * from this gc worker.
-	 *
-	 * This worker is only here to reap expired entries when system went
-	 * idle after a busy period.
-	 */
-	if (next_run) {
+	cond_resched();
+
+	if (next_run)
 		gc_work->early_drop = false;
-		gc_work->next_bucket = 0;
-	}
+
+	gc_work->resched++;
+	if (next_run > 1)
+		pr_debug("next run in %u ms expired %u in %u ms, re-sched count %u\n", jiffies_to_msecs(next_run), gc_work->expired, jiffies_to_msecs(delta_time), gc_work->resched - 1);
+
 	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
-	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
+	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
 	gc_work->exiting = false;
 }
 
-- 
2.34.1

