All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH nf] netfilter: conntrack: collect all entries in one cycle
@ 2021-07-26 22:29 Florian Westphal
  2021-08-05 11:03 ` Pablo Neira Ayuso
  0 siblings, 1 reply; 3+ messages in thread
From: Florian Westphal @ 2021-07-26 22:29 UTC (permalink / raw)
  To: netfilter-devel; +Cc: Florian Westphal, Michal Kubecek

Michal Kubecek reports that conntrack gc is responsible for frequent
wakeups (every 125ms) on idle systems.

On busy systems, timed out entries are evicted during lookup.
The gc worker is only needed to remove entries after system becomes idle
after a busy period.

To resolve this, always scan the entire table.
If the scan is taking too long, reschedule so other work_structs can run
and resume from next bucket.

After a completed scan, wait for 2 minutes before the next cycle.
Heuristics for faster re-schedule are removed.

GC_SCAN_INTERVAL could be exposed as a sysctl in the future to allow
tuning this as-needed or even turn the gc worker off.

Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_core.c | 67 +++++++++----------------------
 1 file changed, 19 insertions(+), 48 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5c03e5106751..3fdcf251ec1f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -66,22 +66,17 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 
 struct conntrack_gc_work {
 	struct delayed_work	dwork;
-	u32			last_bucket;
+	u32			next_bucket;
 	bool			exiting;
 	bool			early_drop;
-	long			next_gc_run;
 };
 
 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV	128u
-/* upper bound of full table scan */
-#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
-/* desired ratio of entries found to be expired */
-#define GC_EVICT_RATIO	50u
+#define GC_SCAN_INTERVAL	(120u * HZ)
+#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
 
 static struct conntrack_gc_work conntrack_gc_work;
 
@@ -1363,17 +1358,13 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
-	unsigned int i, goal, buckets = 0, expired_count = 0;
-	unsigned int nf_conntrack_max95 = 0;
+	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
+	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned long next_run = GC_SCAN_INTERVAL;
 	struct conntrack_gc_work *gc_work;
-	unsigned int ratio, scanned = 0;
-	unsigned long next_run;
-
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
-	goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
-	i = gc_work->last_bucket;
+	i = gc_work->next_bucket;
 	if (gc_work->early_drop)
 		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
@@ -1381,15 +1372,13 @@ static void gc_worker(struct work_struct *work)
 		struct nf_conntrack_tuple_hash *h;
 		struct hlist_nulls_head *ct_hash;
 		struct hlist_nulls_node *n;
-		unsigned int hashsz;
 		struct nf_conn *tmp;
 
-		i++;
 		rcu_read_lock();
 
 		nf_conntrack_get_ht(&ct_hash, &hashsz);
 		if (i >= hashsz)
-			i = 0;
+			break;
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
 			struct nf_conntrack_net *cnet;
@@ -1397,7 +1386,6 @@ static void gc_worker(struct work_struct *work)
 
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
-			scanned++;
 			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
 				nf_ct_offload_timeout(tmp);
 				continue;
@@ -1405,7 +1393,6 @@ static void gc_worker(struct work_struct *work)
 
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
-				expired_count++;
 				continue;
 			}
 
@@ -1438,7 +1425,14 @@ static void gc_worker(struct work_struct *work)
 		 */
 		rcu_read_unlock();
 		cond_resched();
-	} while (++buckets < goal);
+		i++;
+
+		if (time_after(jiffies, end_time) && i < hashsz) {
+			gc_work->next_bucket = i;
+			next_run = 0;
+			break;
+		}
+	} while (i < hashsz);
 
 	if (gc_work->exiting)
 		return;
@@ -1449,40 +1443,17 @@ static void gc_worker(struct work_struct *work)
 	 *
 	 * This worker is only here to reap expired entries when system went
 	 * idle after a busy period.
-	 *
-	 * The heuristics below are supposed to balance conflicting goals:
-	 *
-	 * 1. Minimize time until we notice a stale entry
-	 * 2. Maximize scan intervals to not waste cycles
-	 *
-	 * Normally, expire ratio will be close to 0.
-	 *
-	 * As soon as a sizeable fraction of the entries have expired
-	 * increase scan frequency.
 	 */
-	ratio = scanned ? expired_count * 100 / scanned : 0;
-	if (ratio > GC_EVICT_RATIO) {
-		gc_work->next_gc_run = min_interval;
-	} else {
-		unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
-
-		BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
-
-		gc_work->next_gc_run += min_interval;
-		if (gc_work->next_gc_run > max)
-			gc_work->next_gc_run = max;
+	if (next_run) {
+		gc_work->early_drop = false;
+		gc_work->next_bucket = 0;
 	}
-
-	next_run = gc_work->next_gc_run;
-	gc_work->last_bucket = i;
-	gc_work->early_drop = false;
 	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
 	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
-	gc_work->next_gc_run = HZ;
 	gc_work->exiting = false;
 }
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH nf] netfilter: conntrack: collect all entries in one cycle
  2021-07-26 22:29 [PATCH nf] netfilter: conntrack: collect all entries in one cycle Florian Westphal
@ 2021-08-05 11:03 ` Pablo Neira Ayuso
  0 siblings, 0 replies; 3+ messages in thread
From: Pablo Neira Ayuso @ 2021-08-05 11:03 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel, Michal Kubecek

On Tue, Jul 27, 2021 at 12:29:19AM +0200, Florian Westphal wrote:
> Michal Kubecek reports that conntrack gc is responsible for frequent
> wakeups (every 125ms) on idle systems.
> 
> On busy systems, timed out entries are evicted during lookup.
> The gc worker is only needed to remove entries after system becomes idle
> after a busy period.
> 
> To resolve this, always scan the entire table.
> If the scan is taking too long, reschedule so other work_structs can run
> and resume from next bucket.
> 
> After a completed scan, wait for 2 minutes before the next cycle.
> Heuristics for faster re-schedule are removed.
> 
> GC_SCAN_INTERVAL could be exposed as a sysctl in the future to allow
> tuning this as-needed or even turn the gc worker off.

Applied, thanks.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH nf] netfilter: conntrack: collect all entries in one cycle
@ 2021-07-27  2:49 kernel test robot
  0 siblings, 0 replies; 3+ messages in thread
From: kernel test robot @ 2021-07-27  2:49 UTC (permalink / raw)
  To: kbuild

[-- Attachment #1: Type: text/plain, Size: 10834 bytes --]

CC: kbuild-all(a)lists.01.org
In-Reply-To: <20210726222919.5659-1-fw@strlen.de>
References: <20210726222919.5659-1-fw@strlen.de>
TO: Florian Westphal <fw@strlen.de>
TO: netfilter-devel(a)vger.kernel.org
CC: Florian Westphal <fw@strlen.de>
CC: Michal Kubecek <mkubecek@suse.cz>

Hi Florian,

I love your patch! Perhaps something to improve:

[auto build test WARNING on nf/master]

url:    https://github.com/0day-ci/linux/commits/Florian-Westphal/netfilter-conntrack-collect-all-entries-in-one-cycle/20210727-063110
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git master
:::::: branch date: 4 hours ago
:::::: commit date: 4 hours ago
config: mips-randconfig-s031-20210726 (attached as .config)
compiler: mips-linux-gcc (GCC) 10.3.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # apt-get install sparse
        # sparse version: v0.6.3-341-g8af24329-dirty
        # https://github.com/0day-ci/linux/commit/874bdb31d990217da8f9db6350f4bec782ea2146
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Florian-Westphal/netfilter-conntrack-collect-all-entries-in-one-cycle/20210727-063110
        git checkout 874bdb31d990217da8f9db6350f4bec782ea2146
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=mips SHELL=/bin/bash net/netfilter/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)
   command-line: note: in included file:
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_ACQUIRE redefined
   builtin:0:0: sparse: this was the original definition
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_SEQ_CST redefined
   builtin:0:0: sparse: this was the original definition
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_ACQ_REL redefined
   builtin:0:0: sparse: this was the original definition
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_RELEASE redefined
   builtin:0:0: sparse: this was the original definition
   net/netfilter/nf_conntrack_core.c:2384:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   net/netfilter/nf_conntrack_core.c:2384:9: sparse:    void ( [noderef] __rcu * )( ... )
   net/netfilter/nf_conntrack_core.c:2384:9: sparse:    void ( * )( ... )
   net/netfilter/nf_conntrack_core.c:2703:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   net/netfilter/nf_conntrack_core.c:2703:9: sparse:    void ( [noderef] __rcu * )( ... )
   net/netfilter/nf_conntrack_core.c:2703:9: sparse:    void ( * )( ... )
   net/netfilter/nf_conntrack_core.c:108:13: sparse: sparse: context imbalance in 'nf_conntrack_double_unlock' - unexpected unlock
   net/netfilter/nf_conntrack_core.c:118:13: sparse: sparse: context imbalance in 'nf_conntrack_double_lock' - wrong count at exit
>> net/netfilter/nf_conntrack_core.c:1359:13: sparse: sparse: context imbalance in 'gc_worker' - wrong count at exit
   net/netfilter/nf_conntrack_core.c:2216:28: sparse: sparse: context imbalance in 'get_next_corpse' - unexpected unlock

vim +/gc_worker +1359 net/netfilter/nf_conntrack_core.c

c6dd940b1f747b Florian Westphal  2017-04-16  1358  
b87a2f9199ea82 Florian Westphal  2016-08-25 @1359  static void gc_worker(struct work_struct *work)
b87a2f9199ea82 Florian Westphal  2016-08-25  1360  {
874bdb31d99021 Florian Westphal  2021-07-27  1361  	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
874bdb31d99021 Florian Westphal  2021-07-27  1362  	unsigned int i, hashsz, nf_conntrack_max95 = 0;
874bdb31d99021 Florian Westphal  2021-07-27  1363  	unsigned long next_run = GC_SCAN_INTERVAL;
b87a2f9199ea82 Florian Westphal  2016-08-25  1364  	struct conntrack_gc_work *gc_work;
b87a2f9199ea82 Florian Westphal  2016-08-25  1365  	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
b87a2f9199ea82 Florian Westphal  2016-08-25  1366  
874bdb31d99021 Florian Westphal  2021-07-27  1367  	i = gc_work->next_bucket;
c6dd940b1f747b Florian Westphal  2017-04-16  1368  	if (gc_work->early_drop)
c6dd940b1f747b Florian Westphal  2017-04-16  1369  		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
b87a2f9199ea82 Florian Westphal  2016-08-25  1370  
b87a2f9199ea82 Florian Westphal  2016-08-25  1371  	do {
b87a2f9199ea82 Florian Westphal  2016-08-25  1372  		struct nf_conntrack_tuple_hash *h;
b87a2f9199ea82 Florian Westphal  2016-08-25  1373  		struct hlist_nulls_head *ct_hash;
b87a2f9199ea82 Florian Westphal  2016-08-25  1374  		struct hlist_nulls_node *n;
b87a2f9199ea82 Florian Westphal  2016-08-25  1375  		struct nf_conn *tmp;
b87a2f9199ea82 Florian Westphal  2016-08-25  1376  
b87a2f9199ea82 Florian Westphal  2016-08-25  1377  		rcu_read_lock();
b87a2f9199ea82 Florian Westphal  2016-08-25  1378  
b87a2f9199ea82 Florian Westphal  2016-08-25  1379  		nf_conntrack_get_ht(&ct_hash, &hashsz);
b87a2f9199ea82 Florian Westphal  2016-08-25  1380  		if (i >= hashsz)
874bdb31d99021 Florian Westphal  2021-07-27  1381  			break;
b87a2f9199ea82 Florian Westphal  2016-08-25  1382  
b87a2f9199ea82 Florian Westphal  2016-08-25  1383  		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
c53bd0e96662c2 Florian Westphal  2021-04-12  1384  			struct nf_conntrack_net *cnet;
c6dd940b1f747b Florian Westphal  2017-04-16  1385  			struct net *net;
c6dd940b1f747b Florian Westphal  2017-04-16  1386  
b87a2f9199ea82 Florian Westphal  2016-08-25  1387  			tmp = nf_ct_tuplehash_to_ctrack(h);
b87a2f9199ea82 Florian Westphal  2016-08-25  1388  
90964016e5d347 Pablo Neira Ayuso 2018-01-07  1389  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
90964016e5d347 Pablo Neira Ayuso 2018-01-07  1390  				nf_ct_offload_timeout(tmp);
90964016e5d347 Pablo Neira Ayuso 2018-01-07  1391  				continue;
90964016e5d347 Pablo Neira Ayuso 2018-01-07  1392  			}
90964016e5d347 Pablo Neira Ayuso 2018-01-07  1393  
b87a2f9199ea82 Florian Westphal  2016-08-25  1394  			if (nf_ct_is_expired(tmp)) {
b87a2f9199ea82 Florian Westphal  2016-08-25  1395  				nf_ct_gc_expired(tmp);
b87a2f9199ea82 Florian Westphal  2016-08-25  1396  				continue;
b87a2f9199ea82 Florian Westphal  2016-08-25  1397  			}
c6dd940b1f747b Florian Westphal  2017-04-16  1398  
c6dd940b1f747b Florian Westphal  2017-04-16  1399  			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
c6dd940b1f747b Florian Westphal  2017-04-16  1400  				continue;
c6dd940b1f747b Florian Westphal  2017-04-16  1401  
c6dd940b1f747b Florian Westphal  2017-04-16  1402  			net = nf_ct_net(tmp);
0418b989a46788 Pablo Neira Ayuso 2021-06-02  1403  			cnet = nf_ct_pernet(net);
c53bd0e96662c2 Florian Westphal  2021-04-12  1404  			if (atomic_read(&cnet->count) < nf_conntrack_max95)
c6dd940b1f747b Florian Westphal  2017-04-16  1405  				continue;
c6dd940b1f747b Florian Westphal  2017-04-16  1406  
c6dd940b1f747b Florian Westphal  2017-04-16  1407  			/* need to take reference to avoid possible races */
c6dd940b1f747b Florian Westphal  2017-04-16  1408  			if (!atomic_inc_not_zero(&tmp->ct_general.use))
c6dd940b1f747b Florian Westphal  2017-04-16  1409  				continue;
c6dd940b1f747b Florian Westphal  2017-04-16  1410  
c6dd940b1f747b Florian Westphal  2017-04-16  1411  			if (gc_worker_skip_ct(tmp)) {
c6dd940b1f747b Florian Westphal  2017-04-16  1412  				nf_ct_put(tmp);
c6dd940b1f747b Florian Westphal  2017-04-16  1413  				continue;
c6dd940b1f747b Florian Westphal  2017-04-16  1414  			}
c6dd940b1f747b Florian Westphal  2017-04-16  1415  
c6dd940b1f747b Florian Westphal  2017-04-16  1416  			if (gc_worker_can_early_drop(tmp))
c6dd940b1f747b Florian Westphal  2017-04-16  1417  				nf_ct_kill(tmp);
c6dd940b1f747b Florian Westphal  2017-04-16  1418  
c6dd940b1f747b Florian Westphal  2017-04-16  1419  			nf_ct_put(tmp);
b87a2f9199ea82 Florian Westphal  2016-08-25  1420  		}
b87a2f9199ea82 Florian Westphal  2016-08-25  1421  
b87a2f9199ea82 Florian Westphal  2016-08-25  1422  		/* could check get_nulls_value() here and restart if ct
b87a2f9199ea82 Florian Westphal  2016-08-25  1423  		 * was moved to another chain.  But given gc is best-effort
b87a2f9199ea82 Florian Westphal  2016-08-25  1424  		 * we will just continue with next hash slot.
b87a2f9199ea82 Florian Westphal  2016-08-25  1425  		 */
b87a2f9199ea82 Florian Westphal  2016-08-25  1426  		rcu_read_unlock();
ffa53c5863ddb2 Paul E. McKenney  2017-10-24  1427  		cond_resched();
874bdb31d99021 Florian Westphal  2021-07-27  1428  		i++;
874bdb31d99021 Florian Westphal  2021-07-27  1429  
874bdb31d99021 Florian Westphal  2021-07-27  1430  		if (time_after(jiffies, end_time) && i < hashsz) {
874bdb31d99021 Florian Westphal  2021-07-27  1431  			gc_work->next_bucket = i;
874bdb31d99021 Florian Westphal  2021-07-27  1432  			next_run = 0;
874bdb31d99021 Florian Westphal  2021-07-27  1433  			break;
874bdb31d99021 Florian Westphal  2021-07-27  1434  		}
874bdb31d99021 Florian Westphal  2021-07-27  1435  	} while (i < hashsz);
b87a2f9199ea82 Florian Westphal  2016-08-25  1436  
b87a2f9199ea82 Florian Westphal  2016-08-25  1437  	if (gc_work->exiting)
b87a2f9199ea82 Florian Westphal  2016-08-25  1438  		return;
b87a2f9199ea82 Florian Westphal  2016-08-25  1439  
e0df8cae6c16b9 Florian Westphal  2016-11-04  1440  	/*
e0df8cae6c16b9 Florian Westphal  2016-11-04  1441  	 * Eviction will normally happen from the packet path, and not
e0df8cae6c16b9 Florian Westphal  2016-11-04  1442  	 * from this gc worker.
e0df8cae6c16b9 Florian Westphal  2016-11-04  1443  	 *
e0df8cae6c16b9 Florian Westphal  2016-11-04  1444  	 * This worker is only here to reap expired entries when system went
e0df8cae6c16b9 Florian Westphal  2016-11-04  1445  	 * idle after a busy period.
e0df8cae6c16b9 Florian Westphal  2016-11-04  1446  	 */
874bdb31d99021 Florian Westphal  2021-07-27  1447  	if (next_run) {
c6dd940b1f747b Florian Westphal  2017-04-16  1448  		gc_work->early_drop = false;
874bdb31d99021 Florian Westphal  2021-07-27  1449  		gc_work->next_bucket = 0;
874bdb31d99021 Florian Westphal  2021-07-27  1450  	}
0984d427c1d3cb Vincent Guittot   2017-11-02  1451  	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
b87a2f9199ea82 Florian Westphal  2016-08-25  1452  }
b87a2f9199ea82 Florian Westphal  2016-08-25  1453  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 36274 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-08-05 11:03 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-26 22:29 [PATCH nf] netfilter: conntrack: collect all entries in one cycle Florian Westphal
2021-08-05 11:03 ` Pablo Neira Ayuso
2021-07-27  2:49 kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.