All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Michal Kubecek <mkubecek@suse.cz>,
	Florian Westphal <fw@strlen.de>,
	Pablo Neira Ayuso <pablo@netfilter.org>,
	Sasha Levin <sashal@kernel.org>
Subject: [PATCH 4.19 06/33] netfilter: conntrack: collect all entries in one cycle
Date: Wed,  1 Sep 2021 14:27:55 +0200	[thread overview]
Message-ID: <20210901122250.993208505@linuxfoundation.org> (raw)
In-Reply-To: <20210901122250.752620302@linuxfoundation.org>

From: Florian Westphal <fw@strlen.de>

[ Upstream commit 4608fdfc07e116f9fc0895beb40abad7cdb5ee3d ]

Michal Kubecek reports that conntrack gc is responsible for frequent
wakeups (every 125ms) on idle systems.

On busy systems, timed out entries are evicted during lookup.
The gc worker is only needed to remove entries after system becomes idle
after a busy period.

To resolve this, always scan the entire table.
If the scan is taking too long, reschedule so other work_structs can run
and resume from next bucket.

After a completed scan, wait for 2 minutes before the next cycle.
Heuristics for faster re-schedule are removed.

GC_SCAN_INTERVAL could be exposed as a sysctl in the future to allow
tuning this as-needed or even turn the gc worker off.

Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 net/netfilter/nf_conntrack_core.c | 71 ++++++++++---------------------
 1 file changed, 22 insertions(+), 49 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c5590d36b775..a38caf317dbb 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -70,10 +70,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 
 struct conntrack_gc_work {
 	struct delayed_work	dwork;
-	u32			last_bucket;
+	u32			next_bucket;
 	bool			exiting;
 	bool			early_drop;
-	long			next_gc_run;
 };
 
 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
@@ -81,12 +80,8 @@ static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV	128u
-/* upper bound of full table scan */
-#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
-/* desired ratio of entries found to be expired */
-#define GC_EVICT_RATIO	50u
+#define GC_SCAN_INTERVAL	(120u * HZ)
+#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
 
 static struct conntrack_gc_work conntrack_gc_work;
 
@@ -1198,17 +1193,13 @@ static void nf_ct_offload_timeout(struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
-	unsigned int i, goal, buckets = 0, expired_count = 0;
-	unsigned int nf_conntrack_max95 = 0;
+	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
+	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned long next_run = GC_SCAN_INTERVAL;
 	struct conntrack_gc_work *gc_work;
-	unsigned int ratio, scanned = 0;
-	unsigned long next_run;
-
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
-	goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
-	i = gc_work->last_bucket;
+	i = gc_work->next_bucket;
 	if (gc_work->early_drop)
 		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
@@ -1216,22 +1207,21 @@ static void gc_worker(struct work_struct *work)
 		struct nf_conntrack_tuple_hash *h;
 		struct hlist_nulls_head *ct_hash;
 		struct hlist_nulls_node *n;
-		unsigned int hashsz;
 		struct nf_conn *tmp;
 
-		i++;
 		rcu_read_lock();
 
 		nf_conntrack_get_ht(&ct_hash, &hashsz);
-		if (i >= hashsz)
-			i = 0;
+		if (i >= hashsz) {
+			rcu_read_unlock();
+			break;
+		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
 			struct net *net;
 
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
-			scanned++;
 			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
 				nf_ct_offload_timeout(tmp);
 				continue;
@@ -1239,7 +1229,6 @@ static void gc_worker(struct work_struct *work)
 
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
-				expired_count++;
 				continue;
 			}
 
@@ -1271,7 +1260,14 @@ static void gc_worker(struct work_struct *work)
 		 */
 		rcu_read_unlock();
 		cond_resched();
-	} while (++buckets < goal);
+		i++;
+
+		if (time_after(jiffies, end_time) && i < hashsz) {
+			gc_work->next_bucket = i;
+			next_run = 0;
+			break;
+		}
+	} while (i < hashsz);
 
 	if (gc_work->exiting)
 		return;
@@ -1282,40 +1278,17 @@ static void gc_worker(struct work_struct *work)
 	 *
 	 * This worker is only here to reap expired entries when system went
 	 * idle after a busy period.
-	 *
-	 * The heuristics below are supposed to balance conflicting goals:
-	 *
-	 * 1. Minimize time until we notice a stale entry
-	 * 2. Maximize scan intervals to not waste cycles
-	 *
-	 * Normally, expire ratio will be close to 0.
-	 *
-	 * As soon as a sizeable fraction of the entries have expired
-	 * increase scan frequency.
 	 */
-	ratio = scanned ? expired_count * 100 / scanned : 0;
-	if (ratio > GC_EVICT_RATIO) {
-		gc_work->next_gc_run = min_interval;
-	} else {
-		unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
-
-		BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
-
-		gc_work->next_gc_run += min_interval;
-		if (gc_work->next_gc_run > max)
-			gc_work->next_gc_run = max;
+	if (next_run) {
+		gc_work->early_drop = false;
+		gc_work->next_bucket = 0;
 	}
-
-	next_run = gc_work->next_gc_run;
-	gc_work->last_bucket = i;
-	gc_work->early_drop = false;
 	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
 	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
-	gc_work->next_gc_run = HZ;
 	gc_work->exiting = false;
 }
 
-- 
2.30.2




  parent reply	other threads:[~2021-09-01 12:30 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-01 12:27 [PATCH 4.19 00/33] 4.19.206-rc1 review Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 01/33] net: qrtr: fix another OOB Read in qrtr_endpoint_post Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 02/33] bpf: Do not use ax register in interpreter on div/mod Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 03/33] bpf: Fix 32 bit src register truncation " Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 04/33] bpf: Fix truncation handling for mod32 dst reg wrt zero Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 05/33] ARC: Fix CONFIG_STACKDEPOT Greg Kroah-Hartman
2021-09-01 12:27 ` Greg Kroah-Hartman [this message]
2021-09-01 12:27 ` [PATCH 4.19 07/33] once: Fix panic when module unload Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 08/33] can: usb: esd_usb2: esd_usb2_rx_event(): fix the interchange of the CAN RX and TX error counters Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 09/33] Revert "USB: serial: ch341: fix character loss at high transfer rates" Greg Kroah-Hartman
2021-09-01 12:27 ` [PATCH 4.19 10/33] USB: serial: option: add new VID/PID to support Fibocom FG150 Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 11/33] usb: dwc3: gadget: Fix dwc3_calc_trbs_left() Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 12/33] usb: dwc3: gadget: Stop EP0 transfers during pullup disable Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 13/33] IB/hfi1: Fix possible null-pointer dereference in _extend_sdma_tx_descs() Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 14/33] e1000e: Fix the max snoop/no-snoop latency for 10M Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 15/33] ip_gre: add validation for csum_start Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 16/33] xgene-v2: Fix a resource leak in the error handling path of xge_probe() Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 17/33] net: marvell: fix MVNETA_TX_IN_PRGRS bit number Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 18/33] net: hns3: fix get wrong pfc_en when query PFC configuration Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 19/33] usb: gadget: u_audio: fix race condition on endpoint stop Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 20/33] opp: remove WARN when no valid OPPs remain Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 21/33] virtio: Improve vq->broken access to avoid any compiler optimization Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 22/33] virtio_pci: Support surprise removal of virtio pci device Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 23/33] vringh: Use wiov->used to check for read/write desc order Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 24/33] qed: qed ll2 race condition fixes Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 25/33] qed: Fix null-pointer dereference in qed_rdma_create_qp() Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 26/33] drm: Copy drm_wait_vblank to user before returning Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 27/33] drm/nouveau/disp: power down unused DP links during init Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 28/33] net/rds: dma_map_sg is entitled to merge entries Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 29/33] vt_kdsetmode: extend console locking Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 30/33] fbmem: add margin check to fb_check_caps() Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 31/33] KVM: x86/mmu: Treat NX as used (not reserved) for all !TDP shadow MMUs Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 32/33] Revert "floppy: reintroduce O_NDELAY fix" Greg Kroah-Hartman
2021-09-01 12:28 ` [PATCH 4.19 33/33] net: dont unconditionally copy_from_user a struct ifreq for socket ioctls Greg Kroah-Hartman
2021-09-01 19:23 ` [PATCH 4.19 00/33] 4.19.206-rc1 review Jon Hunter
2021-09-01 20:07 ` Pavel Machek
2021-09-01 21:23 ` Shuah Khan
2021-09-02  1:09 ` Samuel Zou
2021-09-02 11:52 ` Sudip Mukherjee
2021-09-02 14:47 ` Naresh Kamboju
2021-09-02 21:50 ` Guenter Roeck

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210901122250.993208505@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=fw@strlen.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mkubecek@suse.cz \
    --cc=pablo@netfilter.org \
    --cc=sashal@kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.