linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Waiman Long <longman@redhat.com>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Jonathan Corbet <corbet@lwn.net>,
	"Luis R. Rodriguez" <mcgrof@kernel.org>,
	Kees Cook <keescook@chromium.org>
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, linux-doc@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Jan Kara <jack@suse.cz>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Ingo Molnar <mingo@kernel.org>,
	Miklos Szeredi <mszeredi@redhat.com>,
	Matthew Wilcox <willy@infradead.org>,
	Larry Woodman <lwoodman@redhat.com>,
	James Bottomley <James.Bottomley@HansenPartnership.com>,
	"Wangkai (Kevin C)" <wangkai86@huawei.com>,
	Waiman Long <longman@redhat.com>
Subject: [PATCH v6 3/7] fs/dcache: Enable automatic pruning of negative dentries
Date: Fri,  6 Jul 2018 15:32:48 -0400	[thread overview]
Message-ID: <1530905572-817-4-git-send-email-longman@redhat.com> (raw)
In-Reply-To: <1530905572-817-1-git-send-email-longman@redhat.com>

It is not good enough to have a soft limit for the number of
negative dentries in the system and print a warning if that limit is
exceeded. We need to do something about it when this happens.

This patch enables automatic pruning of negative dentries when the
neg-dentry-pc sysctl parameter is non-zero and the soft limit is about
to be exceeded.  The pruning is done gradually via the workqueue API
once a threshold is reached, to minimize the performance impact on
other running tasks.

The current threshold is 1/4 of the initial value of the free pool
count. Once the threshold is reached, the automatic pruning process
will kick in to replenish the free pool. Each pruning run will
scan 64 dentries per LRU list and can remove up to 256 negative
dentries to minimize the LRU lock hold time. The pruning rate will
be 50 Hz if the free pool count is less than 1/8 of the original and
10 Hz otherwise.

The dentry pruning operation may also free some least recently used
positive dentries.

In the unlikely event that a superblock is being unmounted while
negative dentry pruning is in progress on it, the umount may face an
additional delay of up to 0.1s.

This negative dentry shrinker is supposed to be run in the background
with minimal performance impact. So it does not remove excess negative
dentries as fast as the regular memory shrinker when the system is
under high memory pressure.  This negative dentry removal rate should
be enough under normal circumstances. In the extreme case that the
negative dentry generation rate is too high, both this shrinker and
the regular memory shrinker may be running at the same time when the
amount of free memory is too low.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 fs/dcache.c              | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/list_lru.h |   1 +
 mm/list_lru.c            |   4 +-
 3 files changed, 159 insertions(+), 1 deletion(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 175012b..ac25029 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -137,6 +137,11 @@ struct dentry_stat_t dentry_stat = {
  * the extra ones will be returned back to the global pool.
  */
 #define NEG_DENTRY_BATCH	(1 << 8)
+#define NEG_PRUNING_SIZE	(1 << 6)
+#define NEG_PRUNING_SLOW_RATE	(HZ/10)
+#define NEG_PRUNING_FAST_RATE	(HZ/50)
+#define NEG_IS_SB_UMOUNTING(sb)	\
+	unlikely(!(sb)->s_root || !((sb)->s_flags & MS_ACTIVE))
 
 static struct static_key limit_neg_key = STATIC_KEY_INIT_FALSE;
 static int neg_dentry_pc_old;
@@ -147,10 +152,18 @@ struct dentry_stat_t dentry_stat = {
 static long neg_dentry_nfree_init __read_mostly; /* Free pool initial value */
 static struct {
 	raw_spinlock_t nfree_lock;
+	int niter;			/* Pruning iteration count */
+	int lru_count;			/* Per-LRU pruning count */
+	long n_neg;			/* # of negative dentries pruned */
+	long n_pos;			/* # of positive dentries pruned */
 	long nfree;			/* Negative dentry free pool */
+	struct super_block *prune_sb;	/* Super_block for pruning */
 } ndblk ____cacheline_aligned_in_smp;
 proc_handler proc_neg_dentry_pc;
 
+static void prune_negative_dentry(struct work_struct *work);
+static DECLARE_DELAYED_WORK(prune_neg_dentry_work, prune_negative_dentry);
+
 static DEFINE_PER_CPU(long, nr_dentry);
 static DEFINE_PER_CPU(long, nr_dentry_unused);
 static DEFINE_PER_CPU(long, nr_dentry_neg);
@@ -338,6 +351,25 @@ static void __neg_dentry_inc(struct dentry *dentry)
 	 */
 	if (!cnt)
 		pr_warn_once("Too many negative dentries.");
+
+	/*
+	 * Initiate negative dentry pruning if free pool has less than
+	 * 1/4 of its initial value.
+	 */
+	if ((READ_ONCE(ndblk.nfree) < READ_ONCE(neg_dentry_nfree_init)/4) &&
+	    !READ_ONCE(ndblk.prune_sb) &&
+	    !cmpxchg(&ndblk.prune_sb, NULL, dentry->d_sb)) {
+		/*
+		 * Abort if umounting is in progress, otherwise take a
+		 * reference and move on.
+		 */
+		if (NEG_IS_SB_UMOUNTING(ndblk.prune_sb)) {
+			WRITE_ONCE(ndblk.prune_sb, NULL);
+		} else {
+			atomic_inc(&ndblk.prune_sb->s_active);
+			schedule_delayed_work(&prune_neg_dentry_work, 1);
+		}
+	}
 }
 
 static inline void neg_dentry_inc(struct dentry *dentry)
@@ -1411,6 +1443,129 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
+/*
+ * A modified version that attempts to remove a limited number of negative
+ * dentries as well as some other non-negative dentries at the front.
+ */
+static enum lru_status dentry_negative_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+	enum lru_status	status = LRU_SKIP;
+
+	/*
+	 * Limit amount of dentry walking in each LRU list.
+	 */
+	if (ndblk.lru_count >= NEG_PRUNING_SIZE) {
+		ndblk.lru_count = 0;
+		return LRU_STOP;
+	}
+	ndblk.lru_count++;
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(lru, dentry);
+		status = LRU_REMOVED;
+		goto out;
+	}
+
+	/*
+	 * Dentries with reference bit on are moved back to the tail.
+	 */
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		status = LRU_ROTATE;
+		goto out;
+	}
+
+	status = LRU_REMOVED;
+	d_lru_shrink_move(lru, dentry, freeable);
+	if (d_is_negative(dentry))
+		ndblk.n_neg++;
+out:
+	spin_unlock(&dentry->d_lock);
+	return status;
+}
+
+/*
+ * A workqueue function to prune negative dentry.
+ *
+ * The pruning is done gradually over time so as to have as little
+ * performance impact as possible.
+ */
+static void prune_negative_dentry(struct work_struct *work)
+{
+	int freed, last_n_neg;
+	long nfree;
+	struct super_block *sb = READ_ONCE(ndblk.prune_sb);
+	LIST_HEAD(dispose);
+
+	if (!sb)
+		return;
+	if (NEG_IS_SB_UMOUNTING(sb) || !READ_ONCE(neg_dentry_pc))
+		goto stop_pruning;
+
+	ndblk.niter++;
+	ndblk.lru_count = 0;
+	last_n_neg = ndblk.n_neg;
+	freed = list_lru_walk(&sb->s_dentry_lru, dentry_negative_lru_isolate,
+			      &dispose, NEG_DENTRY_BATCH);
+
+	if (freed)
+		shrink_dentry_list(&dispose);
+	ndblk.n_pos += freed - (ndblk.n_neg - last_n_neg);
+
+	/*
+	 * Continue delayed pruning until negative dentry free pool is at
+	 * least 1/2 of the initial value, the super_block has no more
+	 * negative dentries left at the front, or unmounting is in
+	 * progress.
+	 *
+	 * The pruning rate depends on the size of the free pool. The
+	 * faster rate is used when there is less than 1/8 left.
+	 * Otherwise, the slower rate will be used.
+	 */
+	nfree = READ_ONCE(ndblk.nfree);
+	if ((ndblk.n_neg == last_n_neg) ||
+	    (nfree >= neg_dentry_nfree_init/2) || NEG_IS_SB_UMOUNTING(sb))
+		goto stop_pruning;
+
+	schedule_delayed_work(&prune_neg_dentry_work,
+			     (nfree < neg_dentry_nfree_init/8)
+			     ? NEG_PRUNING_FAST_RATE : NEG_PRUNING_SLOW_RATE);
+	return;
+
+stop_pruning:
+#ifdef CONFIG_DEBUG_KERNEL
+	/*
+	 * Report large negative dentry pruning event.
+	 */
+	if (ndblk.n_neg > NEG_PRUNING_SIZE) {
+		pr_info("Negative dentry pruning (SB=%s):\n\t"
+			"%d iterations, %ld/%ld neg/pos dentries freed.\n",
+			ndblk.prune_sb->s_id, ndblk.niter, ndblk.n_neg,
+			ndblk.n_pos);
+	}
+#endif
+	ndblk.niter = 0;
+	ndblk.n_neg = ndblk.n_pos = 0;
+	deactivate_super(sb);
+	WRITE_ONCE(ndblk.prune_sb, NULL);
+}
+
 /**
  * enum d_walk_ret - action to talke during tree walk
  * @D_WALK_CONTINUE:	contrinue walk
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 96def9d..a9598a0 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -23,6 +23,7 @@ enum lru_status {
 	LRU_SKIP,		/* item cannot be locked, skip */
 	LRU_RETRY,		/* item not freeable. May drop the lock
 				   internally, but has to return locked. */
+	LRU_STOP,		/* stop walking the list */
 };
 
 struct list_lru_one {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fcfb6c8..2ee5d3a 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -246,11 +246,13 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
 			 */
 			assert_spin_locked(&nlru->lock);
 			goto restart;
+		case LRU_STOP:
+			goto out;
 		default:
 			BUG();
 		}
 	}
-
+out:
 	spin_unlock(&nlru->lock);
 	return isolated;
 }
-- 
1.8.3.1


  parent reply	other threads:[~2018-07-06 19:36 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
2018-07-06 19:32 ` [PATCH v6 1/7] fs/dcache: Track & report number " Waiman Long
2018-07-06 19:32 ` [PATCH v6 2/7] fs/dcache: Add sysctl parameter neg-dentry-pc as a soft limit on " Waiman Long
2018-07-06 19:32 ` Waiman Long [this message]
2018-07-06 19:32 ` [PATCH v6 4/7] fs/dcache: Spread negative dentry pruning across multiple CPUs Waiman Long
2018-07-06 19:32 ` [PATCH v6 5/7] fs/dcache: Add negative dentries to LRU head initially Waiman Long
2018-07-06 19:32 ` [PATCH v6 6/7] fs/dcache: Allow optional enforcement of negative dentry limit Waiman Long
2018-07-06 19:32 ` [PATCH v6 7/7] fs/dcache: Allow deconfiguration of negative dentry code to reduce kernel size Waiman Long
2018-07-06 21:54   ` Eric Biggers
2018-07-06 22:28 ` [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Al Viro
2018-07-07  3:02   ` Waiman Long
2018-07-09  8:19 ` Michal Hocko
2018-07-09 16:01   ` Waiman Long
2018-07-10 14:27     ` Michal Hocko
2018-07-10 16:09       ` Waiman Long
2018-07-11 10:21         ` Michal Hocko
2018-07-11 15:13           ` Waiman Long
2018-07-11 17:42             ` James Bottomley
2018-07-11 19:07               ` Waiman Long
2018-07-11 19:21                 ` James Bottomley
2018-07-12 15:54                   ` Waiman Long
2018-07-12 16:04                     ` James Bottomley
2018-07-12 16:26                       ` Waiman Long
2018-07-12 17:33                         ` James Bottomley
2018-07-13 15:32                           ` Waiman Long
2018-07-12 16:49                       ` Matthew Wilcox
2018-07-12 17:21                         ` James Bottomley
2018-07-12 18:06                           ` Linus Torvalds
2018-07-12 19:57                             ` James Bottomley
2018-07-13  0:36                               ` Dave Chinner
2018-07-13 15:46                                 ` James Bottomley
2018-07-13 23:17                                   ` Dave Chinner
2018-07-16  9:10                                   ` Michal Hocko
2018-07-16 14:42                                     ` James Bottomley
2018-07-16  9:09                                 ` Michal Hocko
2018-07-16  9:12                                   ` Michal Hocko
2018-07-16 12:41                                   ` Matthew Wilcox
2018-07-16 23:40                                     ` Andrew Morton
2018-07-17  1:30                                       ` Matthew Wilcox
2018-07-17  8:33                                       ` Michal Hocko
2018-07-19  0:33                                         ` Dave Chinner
2018-07-19  8:45                                           ` Michal Hocko
2018-07-19  9:13                                             ` Jan Kara
2018-07-18 18:39                                       ` Waiman Long
2018-07-18 16:17                                   ` Waiman Long
2018-07-19  8:48                                     ` Michal Hocko
2018-07-12  8:48             ` Michal Hocko
2018-07-12 16:12               ` Waiman Long
2018-07-12 23:16                 ` Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1530905572-817-4-git-send-email-longman@redhat.com \
    --to=longman@redhat.com \
    --cc=James.Bottomley@HansenPartnership.com \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=jack@suse.cz \
    --cc=keescook@chromium.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lwoodman@redhat.com \
    --cc=mcgrof@kernel.org \
    --cc=mingo@kernel.org \
    --cc=mszeredi@redhat.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=wangkai86@huawei.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).