linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: torvalds@linux-foundation.org, akpm@linux-foundation.org,
	alan@lxorguk.ukuu.org.uk, Eric Dumazet <eric.dumazet@gmail.com>,
	"David S. Miller" <davem@davemloft.net>
Subject: [60/67] ipv4: reintroduce route cache garbage collector
Date: Tue, 03 Jan 2012 14:30:18 -0800	[thread overview]
Message-ID: <20120103223036.382195374@clark.kroah.org> (raw)
In-Reply-To: <20120103223043.GA26738@kroah.com>

3.0-stable review patch.  If anyone has any objections, please let me know.

------------------


From: Eric Dumazet <eric.dumazet@gmail.com>

[ Upstream commit 9f28a2fc0bd77511f649c0a788c7bf9a5fd04edb ]

Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer)
removed the IP route cache garbage collector a bit too soon, as this gc
was responsible for cleaning up expired routes and releasing their
neighbour references.

As pointed out by Robert Gladewitz, recent kernels can fill and exhaust
their neighbour cache.

Reintroduce the garbage collection, since we'll have to wait until our
neighbour lookups become refcount-less before we can stop depending on it.

Reported-by: Robert Gladewitz <gladewitz@gmx.de>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 net/ipv4/route.c |  106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -132,6 +132,9 @@ static int ip_rt_min_pmtu __read_mostly
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int rt_chain_length_max __read_mostly	= 20;
 
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
+
 /*
  *	Interface to generic destination cache.
  */
@@ -821,6 +824,97 @@ static int has_noalias(const struct rtab
 	return ONE;
 }
 
+static void rt_check_expire(void)
+{
+	static unsigned int rover;
+	unsigned int i = rover, goal;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
+	unsigned long samples = 0;
+	unsigned long sum = 0, sum2 = 0;
+	unsigned long delta;
+	u64 mult;
+
+	delta = jiffies - expires_ljiffies;
+	expires_ljiffies = jiffies;
+	mult = ((u64)delta) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask)
+		goal = rt_hash_mask + 1;
+	for (; goal > 0; goal--) {
+		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
+
+		i = (i + 1) & rt_hash_mask;
+		rthp = &rt_hash_table[i].chain;
+
+		if (need_resched())
+			cond_resched();
+
+		samples++;
+
+		if (rcu_dereference_raw(*rthp) == NULL)
+			continue;
+		length = 0;
+		spin_lock_bh(rt_hash_lock_addr(i));
+		while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+			prefetch(rth->dst.rt_next);
+			if (rt_is_expired(rth)) {
+				*rthp = rth->dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
+			if (rth->dst.expires) {
+				/* Entry is expired even if it is in use */
+				if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+					tmo >>= 1;
+					rthp = &rth->dst.rt_next;
+					/*
+					 * We only count entries on
+					 * a chain with equal hash inputs once
+					 * so that entries for different QOS
+					 * levels, and other non-hash input
+					 * attributes don't unfairly skew
+					 * the length computation
+					 */
+					length += has_noalias(rt_hash_table[i].chain, rth);
+					continue;
+				}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
+
+			/* Cleanup aged off entries. */
+			*rthp = rth->dst.rt_next;
+			rt_free(rth);
+		}
+		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
+	}
+	rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+	rt_check_expire();
+	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -3088,6 +3182,13 @@ static ctl_table ipv4_route_table[] = {
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
+		.procname	= "gc_interval",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
 		.maxlen		= sizeof(int),
@@ -3297,6 +3398,11 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
+	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+	expires_ljiffies = jiffies;
+	schedule_delayed_work(&expires_work,
+		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM



  parent reply	other threads:[~2012-01-03 22:37 UTC|newest]

Thread overview: 82+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-01-03 22:30 [00/67] 3.0.16-stable review Greg KH
2012-01-03 22:29 ` [01/67] ARM: OMAP: rx51: fix USB Greg KH
2012-01-03 22:29 ` [02/67] ipip, sit: copy parms.name after register_netdevice Greg KH
2012-01-03 22:29 ` [03/67] rtc: Expire alarms after the time is set Greg KH
2012-01-03 22:46   ` John Stultz
2012-01-03 22:47     ` Greg KH
2012-01-03 23:11       ` John Stultz
2012-01-03 22:29 ` [04/67] rtc: m41t80: Workaround broken alarm functionality Greg KH
2012-01-03 22:29 ` [05/67] drm/i915: prevent division by zero when asking for chipset power Greg KH
2012-01-03 22:29 ` [06/67] cfq-iosched: free cic_index if blkio_alloc_blkg_stats fails Greg KH
2012-01-03 22:29 ` [07/67] cfq-iosched: fix cfq_cic_link() race confition Greg KH
2012-01-03 22:29 ` [08/67] SCSI: zfcp: return early from slave_destroy if slave_alloc returned early Greg KH
2012-01-03 22:29 ` [09/67] SCSI: mpt2sas: _scsih_smart_predicted_fault uses GFP_KERNEL in interrupt context Greg KH
2012-01-03 22:29 ` [10/67] SCSI: fcoe: Fix preempt count leak in fcoe_filter_frames() Greg KH
2012-01-03 22:29 ` [11/67] mac80211: fix another race in aggregation start Greg KH
2012-01-03 22:29 ` [12/67] block: initialize request_queues numa node during Greg KH
2012-01-03 22:29 ` [13/67] ssb: fix init regression with SoCs Greg KH
2012-01-03 22:29 ` [14/67] MXC PWM: should active during DOZE/WAIT/DBG mode Greg KH
2012-01-03 22:29 ` [15/67] Input: synaptics - fix touchpad not working after S2R on Vostro V13 Greg KH
2012-01-03 22:29 ` [16/67] percpu: fix per_cpu_ptr_to_phys() handling of non-page-aligned addresses Greg KH
2012-01-03 22:29 ` [17/67] binary_sysctl(): fix memory leak Greg KH
2012-01-03 22:29 ` [18/67] oom: fix integer overflow of points in oom_badness Greg KH
2012-01-03 22:29 ` [19/67] oprofile: Fix uninitialized memory access when writing to writing to oprofilefs Greg KH
2012-01-03 22:29 ` [20/67] NFSv4.1: Ensure that we handle _all_ SEQUENCE status bits Greg KH
2012-01-03 22:29 ` [21/67] SELinux: Fix RCU deref check warning in sel_netport_insert() Greg KH
2012-01-03 22:29 ` [22/67] nilfs2: unbreak compat ioctl Greg KH
2012-01-03 22:29 ` [23/67] mmc: vub300: fix type of firmware_rom_wait_states module parameter Greg KH
2012-01-03 22:29 ` [24/67] cgroups: fix a css_set not found bug in cgroup_attach_proc Greg KH
2012-01-03 22:29 ` [25/67] mfd: Fix twl-core oops while calling twl_i2c_* for unbound driver Greg KH
2012-01-03 22:29 ` [26/67] vfs: __read_cache_page should use gfp argument rather than GFP_KERNEL Greg KH
2012-01-03 22:29 ` [27/67] media: s5p-fimc: Use correct fourcc for RGB565 colour format Greg KH
2012-01-03 22:29 ` [28/67] ath9k: fix max phy rate at rate control init Greg KH
2012-01-03 22:29 ` [29/67] iwlwifi: do not set the sequence control bit is not needed Greg KH
2012-01-03 22:29 ` [30/67] iwlwifi: allow to switch to HT40 if not associated Greg KH
2012-01-03 22:29 ` [31/67] memcg: keep root group unchanged if creation fails Greg KH
2012-01-03 22:29 ` [32/67] VFS: Fix race between CPU hotplug and lglocks Greg KH
2012-01-03 22:29 ` [33/67] ARM:imx:fix pwm period value Greg KH
2012-01-03 22:29 ` [34/67] ARM: 7214/1: mmc: mmci: Fixup handling of MCI_STARTBITERR Greg KH
2012-01-03 22:29 ` [35/67] ARM: 7220/1: mmc: mmci: Fixup error handling for dma Greg KH
2012-01-03 22:29 ` [36/67] oprofile, arm/sh: Fix oprofile_arch_exit() linkage issue Greg KH
2012-01-03 22:29 ` [37/67] futex: Fix uninterruptible loop due to gate_area Greg KH
2012-01-03 22:29 ` [38/67] watchdog: hpwdt: Changes to handle NX secure bit in 32bit path Greg KH
2012-01-03 22:29 ` [39/67] drm/radeon/kms: bail on BTC parts if MC ucode is missing Greg KH
2012-01-03 22:29 ` [40/67] mm: hugetlb: fix non-atomic enqueue of huge page Greg KH
2012-01-03 22:29 ` [41/67] mpt2sas crashes on shutdown Greg KH
2012-01-03 22:30 ` [42/67] sparc64: Fix MSIQ HV call ordering in pci_sun4v_msiq_build_irq() Greg KH
2012-01-03 22:30 ` [43/67] sparc32: Be less strict in matching %lo part of relocation Greg KH
2012-01-03 22:30 ` [44/67] sparc64: Patch sun4v code sequences properly on module load Greg KH
2012-01-03 22:30 ` [45/67] sparc: Kill custom io_remap_pfn_range() Greg KH
2012-01-03 22:30 ` [46/67] sparc32: Remove non-kernel code from memcpy implementation Greg KH
2012-01-03 22:30 ` [47/67] sparc32: Remove uses of %g7 in " Greg KH
2012-01-03 22:30 ` [48/67] sparc32: Correct the return value of memcpy Greg KH
2012-01-03 22:30 ` [49/67] sparc64: Fix masking and shifting in VIS fpcmp emulation Greg KH
2012-01-03 22:30 ` [50/67] sparc: Fix handling of orig_i0 wrt. debugging when restarting syscalls Greg KH
2012-01-03 22:30 ` [51/67] net: bpf_jit: fix an off-one bug in x86_64 cond jump target Greg KH
2012-01-03 22:30 ` [52/67] ppp: fix pptp double release_sock in pptp_bind() Greg KH
2012-01-03 22:30 ` [53/67] llc: llc_cmsg_rcv was getting called after sk_eat_skb Greg KH
2012-01-03 22:30 ` [54/67] mqprio: Avoid panic if no options are provided Greg KH
2012-01-03 22:30 ` [55/67] net: have ipconfig not wait if no dev is available Greg KH
2012-01-03 22:30 ` [56/67] sch_gred: should not use GFP_KERNEL while holding a spinlock Greg KH
2012-01-03 22:30 ` [57/67] sctp: fix incorrect overflow check on autoclose Greg KH
2012-01-03 22:30 ` [58/67] sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd Greg KH
2012-01-03 22:30 ` [59/67] ipv4: flush route cache after change accept_local Greg KH
2012-01-03 22:30 ` Greg KH [this message]
2012-01-03 22:30 ` [61/67] ipv4: using prefetch requires including prefetch.h Greg KH
2012-01-03 22:30 ` [62/67] iwlwifi: update SCD BC table for all SCD queues Greg KH
2012-01-03 22:30 ` [63/67] mfd: Fix mismatch in twl4030 mutex lock-unlock Greg KH
2012-01-03 22:30 ` [64/67] mfd: Copy the device pointer to the twl4030-madc structure Greg KH
2012-01-03 22:30 ` [65/67] mfd: Check for twl4030-madc NULL pointer Greg KH
2012-01-03 22:30 ` [66/67] mfd: Turn on the twl4030-madc MADC clock Greg KH
2012-01-03 22:30 ` [67/67] xen/swiotlb: Use page alignment for early buffer allocation Greg KH
2012-01-05 19:18 ` [00/67] 3.0.16-stable review Greg KH
2012-01-05 19:26   ` Greg KH
2012-01-05 22:46   ` Greg KH
2012-01-05 22:44     ` [40/73] mpt2sas: fix non-x86 crash on shutdown Greg KH
2012-01-05 22:44     ` [67/73] xfs: log the inode in ->write_inode calls for kupdate Greg KH
2012-01-05 22:44     ` [68/73] xfs: log all dirty inodes in xfs_fs_sync_fs Greg KH
2012-01-05 22:44     ` [69/73] drm/radeon/kms/atom: fix possible segfault in pm setup Greg KH
2012-01-05 22:44     ` [70/73] hung_task: fix false positive during vfork Greg KH
2012-01-05 22:44     ` [71/73] Revert "rtc: Disable the alarm in the hardware" Greg KH
2012-01-05 22:44     ` [72/73] ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race Greg KH
2012-01-05 22:44     ` [73/73] ath9k: Fix kernel panic in AR2427 in AP mode Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120103223036.382195374@clark.kroah.org \
    --to=gregkh@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=davem@davemloft.net \
    --cc=eric.dumazet@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).