From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com,
	hch@infradead.org, bfoster@redhat.com
Subject: [PATCH 12/16] xfs: parallelize inode inactivation
Date: Sun, 13 Jun 2021 10:21:02 -0700
Message-ID: <162360486288.1530792.18351614470122965770.stgit@locust>
In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust>

From: Darrick J. Wong <djwong@kernel.org>

Split the deferred inode inactivation work into per-AG work items so
that we can take advantage of parallelization: each AG now has its own
delayed work item and worker instead of sharing a single per-mount one.
While we're at it, scale each AG's scheduling delay by that AG's free
space so that AGs running low on space get cleaned out sooner.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ag.c |    3 +
 fs/xfs/libxfs/xfs_ag.h |    3 +
 fs/xfs/xfs_icache.c    |  101 ++++++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_mount.c     |   11 +++--
 fs/xfs/xfs_mount.h     |    2 +-
 fs/xfs/xfs_super.c     |    1 -
 fs/xfs/xfs_trace.h     |    8 ++--
 7 files changed, 97 insertions(+), 32 deletions(-)
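
As a side note for reviewers: below is a standalone userspace model of
the delay scaling that xfs_work_delay_perag() introduces in this patch.
It is a sketch with made-up AG geometry, not kernel code; only the
threshold layout (five low free space thresholds at 1%-5% of the AG)
mirrors the patch.

#include <stdio.h>

/*
 * Quarter the delay at the first unmet threshold (5% free), then halve
 * it again at each further threshold, as xfs_work_delay_perag() does.
 */
static unsigned int
scale_delay(unsigned long long freesp, unsigned long long ag_blocks,
	    unsigned int delay_ms)
{
	unsigned long long onepct = ag_blocks / 100;
	unsigned int shift = 0;

	if (freesp < 5 * onepct) {
		shift = 2;
		if (freesp < 4 * onepct)
			shift++;
		if (freesp < 3 * onepct)
			shift++;
		if (freesp < 2 * onepct)
			shift++;
		if (freesp < onepct)
			shift++;
	}
	return delay_ms >> shift;
}

int main(void)
{
	unsigned long long ag_blocks = 1048576;		/* a 1M-block AG */
	unsigned long long onepct = ag_blocks / 100;

	/* With a 100ms default delay this prints 100, 25, 12, 6, 3, 1. */
	for (int pct = 6; pct >= 1; pct--)
		printf("just under %d%% free -> %u ms\n", pct,
		       scale_delay(pct * onepct - 1, ag_blocks, 100));
	return 0;
}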


diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 0765a0ba30e1..7652d90d7d0d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -173,6 +173,7 @@ __xfs_free_perag(
 	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
 
 	ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+	ASSERT(!delayed_work_pending(&pag->pag_inodegc_work));
 	ASSERT(atomic_read(&pag->pag_ref) == 0);
 	kmem_free(pag);
 }
@@ -195,6 +196,7 @@ xfs_free_perag(
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
+		cancel_delayed_work_sync(&pag->pag_inodegc_work);
 		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 
@@ -253,6 +255,7 @@ xfs_initialize_perag(
 		spin_lock_init(&pag->pagb_lock);
 		spin_lock_init(&pag->pag_state_lock);
 		INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+		INIT_DELAYED_WORK(&pag->pag_inodegc_work, xfs_inodegc_worker);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		init_waitqueue_head(&pag->pagb_wait);
 		pag->pagb_count = 0;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4c6f9045baca..3929ea35b0d4 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -96,6 +96,9 @@ struct xfs_perag {
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 
+	/* background inode inactivation */
+	struct delayed_work	pag_inodegc_work;
+
 	/*
 	 * Unlinked inode information.  This incore information reflects
 	 * data stored in the AGI, so callers must hold the AGI buffer lock
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 210a9e3cd19e..f58d0455e38f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -299,6 +299,43 @@ xfs_worker_delay_freesp(
 	return delay_ms >> shift;
 }
 
+/*
+ * Scale down the background work delay if we're low on free space in this AG.
+ * As with preallocation throttling, quarter the delay at the first low free
+ * space threshold that isn't met and halve it at each one after.  Returns ms.
+ */
+static inline unsigned int
+xfs_work_delay_perag(
+	struct xfs_perag	*pag,
+	unsigned int		delay_ms)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	xfs_extlen_t		freesp;
+	unsigned int		shift = 0;
+
+	if (!pag->pagf_init)
+		return delay_ms;
+
+	/* Free space in this AG that can be allocated to file data */
+	freesp = pag->pagf_freeblks + pag->pagf_flcount;
+	freesp -= (pag->pag_meta_resv.ar_reserved +
+		   pag->pag_rmapbt_resv.ar_reserved);
+
+	if (freesp < mp->m_ag_low_space[XFS_LOWSP_5_PCNT]) {
+		shift = 2;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_4_PCNT])
+			shift++;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_3_PCNT])
+			shift++;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_2_PCNT])
+			shift++;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_1_PCNT])
+			shift++;
+	}
+
+	return delay_ms >> shift;
+}
+
 /*
  * Compute the lag between scheduling and executing background work based on
  * free space in the filesystem.  If an inode is passed in, its dquots will
@@ -306,18 +343,20 @@ xfs_worker_delay_freesp(
  */
 static inline unsigned int
 xfs_worker_delay_ms(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
 	unsigned int		default_ms)
 {
-	unsigned int		udelay, gdelay, pdelay, fdelay;
+	struct xfs_mount	*mp = pag->pag_mount;
+	unsigned int		udelay, gdelay, pdelay, fdelay, adelay;
 
 	udelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_USER, default_ms);
 	gdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_GROUP, default_ms);
 	pdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_PROJ, default_ms);
 	fdelay = xfs_worker_delay_freesp(mp, default_ms);
+	adelay = xfs_work_delay_perag(pag, default_ms);
 
-	return min(min(udelay, gdelay), min(pdelay, fdelay));
+	return min(adelay, min(min(udelay, gdelay), min(pdelay, fdelay)));
 }
 
 /*
@@ -343,9 +382,11 @@ xfs_blockgc_queue(
  */
 static void
 xfs_inodegc_queue(
-	struct xfs_mount        *mp,
+	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
+
 	if (!test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
@@ -353,9 +394,9 @@ xfs_inodegc_queue(
 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
 		unsigned int	delay;
 
-		delay = xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms);
-		trace_xfs_inodegc_queue(mp, delay, _RET_IP_);
-		queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work,
+		delay = xfs_worker_delay_ms(pag, ip, xfs_inodegc_ms);
+		trace_xfs_inodegc_queue(mp, pag->pag_agno, delay, _RET_IP_);
+		queue_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work,
 				msecs_to_jiffies(delay));
 	}
 	rcu_read_unlock();
@@ -367,11 +408,13 @@ xfs_inodegc_queue(
  */
 static void
 xfs_inodegc_queue_sooner(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
+
 	if (!XFS_IS_QUOTA_ON(mp) ||
-	    !delayed_work_pending(&mp->m_inodegc_work) ||
+	    !delayed_work_pending(&pag->pag_inodegc_work) ||
 	    !test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
@@ -379,11 +422,11 @@ xfs_inodegc_queue_sooner(
 	if (!radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG))
 		goto unlock;
 
-	if (xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms) == xfs_inodegc_ms)
+	if (xfs_worker_delay_ms(pag, ip, xfs_inodegc_ms) == xfs_inodegc_ms)
 		goto unlock;
 
-	trace_xfs_inodegc_queue(mp, 0, _RET_IP_);
-	queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0);
+	trace_xfs_inodegc_queue(mp, pag->pag_agno, 0, _RET_IP_);
+	mod_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work, 0);
 unlock:
 	rcu_read_unlock();
 }
@@ -427,7 +470,7 @@ xfs_perag_set_inode_tag(
 		xfs_blockgc_queue(pag);
 		break;
 	case XFS_ICI_INODEGC_TAG:
-		xfs_inodegc_queue(mp, ip);
+		xfs_inodegc_queue(pag, ip);
 		break;
 	}
 
@@ -561,7 +604,7 @@ xfs_inode_mark_reclaimable(
 	spin_unlock(&pag->pag_ici_lock);
 
 	if (need_inactive && already_queued)
-		xfs_inodegc_queue_sooner(mp, ip);
+		xfs_inodegc_queue_sooner(pag, ip);
 
 	xfs_perag_put(pag);
 }
@@ -2058,16 +2101,17 @@ void
 xfs_inodegc_worker(
 	struct work_struct	*work)
 {
-	struct xfs_mount	*mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_inodegc_work);
+	struct xfs_perag	*pag = container_of(to_delayed_work(work),
+					struct xfs_perag, pag_inodegc_work);
+	struct xfs_mount	*mp = pag->pag_mount;
 
 	/*
 	 * Inactivation never returns error codes and never fails to push a
 	 * tagged inode to reclaim.  Loop until there's nothing left.
 	 */
-	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
-		trace_xfs_inodegc_worker(mp, 0, _RET_IP_);
-		xfs_icwalk(mp, XFS_ICWALK_INODEGC, NULL);
+	while (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_INODEGC_TAG)) {
+		trace_xfs_inodegc_worker(mp, pag->pag_agno, 0, _RET_IP_);
+		xfs_icwalk_ag(pag, XFS_ICWALK_INODEGC, NULL);
 	}
 }
 
@@ -2079,8 +2123,13 @@ void
 xfs_inodegc_flush(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	trace_xfs_inodegc_flush(mp, 0, _RET_IP_);
-	flush_delayed_work(&mp->m_inodegc_work);
+
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG)
+		flush_delayed_work(&pag->pag_inodegc_work);
 }
 
 /* Disable the inode inactivation background worker and wait for it to stop. */
@@ -2088,10 +2137,14 @@ void
 xfs_inodegc_stop(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	if (!test_and_clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
-	cancel_delayed_work_sync(&mp->m_inodegc_work);
+	for_each_perag(mp, agno, pag)
+		cancel_delayed_work_sync(&pag->pag_inodegc_work);
 	trace_xfs_inodegc_stop(mp, 0, _RET_IP_);
 }
 
@@ -2103,11 +2156,15 @@ void
 xfs_inodegc_start(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	if (test_and_set_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
 	trace_xfs_inodegc_start(mp, 0, _RET_IP_);
-	xfs_inodegc_queue(mp, NULL);
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG)
+		xfs_inodegc_queue(pag, NULL);
 }
 
 /* XFS Inode Cache Walking Code */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ab65a14e51e6..eff375f92005 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -365,13 +365,16 @@ void
 xfs_set_low_space_thresholds(
 	struct xfs_mount	*mp)
 {
-	int i;
+	uint64_t		space = mp->m_sb.sb_dblocks;
+	uint32_t		ag_space = mp->m_sb.sb_agblocks;
+	int			i;
+
+	do_div(space, 100);
+	ag_space /= 100;	/* do_div() requires a 64-bit dividend */
 
 	for (i = 0; i < XFS_LOWSP_MAX; i++) {
-		uint64_t space = mp->m_sb.sb_dblocks;
-
-		do_div(space, 100);
 		mp->m_low_space[i] = space * (i + 1);
+		mp->m_ag_low_space[i] = ag_space * (i + 1);
 	}
 }
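
As a worked example of the threshold precomputation above (illustrative
geometry, not taken from the patch), one percent of the filesystem and
of an AG is computed once and then multiplied out per threshold:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Made-up geometry: a 1M-block fs carved into 256K-block AGs. */
	uint64_t space = 1000000;	/* sb_dblocks  */
	uint32_t ag_space = 262144;	/* sb_agblocks */

	space /= 100;			/* 1% of the filesystem */
	ag_space /= 100;		/* 1% of one AG */

	for (int i = 0; i < 5; i++)	/* XFS_LOWSP_MAX == 5 */
		printf("m_low_space[%d] = %llu, m_ag_low_space[%d] = %u\n",
		       i, (unsigned long long)(space * (i + 1)),
		       i, ag_space * (i + 1));
	return 0;
}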
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index dc906b78e24c..154aa95d968c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -131,6 +131,7 @@ typedef struct xfs_mount {
 	uint			m_rsumsize;	/* size of rt summary, bytes */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_qflags;	/* quota status flags */
+	int32_t			m_ag_low_space[XFS_LOWSP_MAX];
 	uint64_t		m_flags;	/* global mount flags */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
@@ -191,7 +192,6 @@ typedef struct xfs_mount {
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct delayed_work	m_inodegc_work; /* background inode inactive */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
 	struct xfs_kobj		m_error_meta_kobj;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 45ef63b5b2f0..66b61d38f401 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1865,7 +1865,6 @@ static int xfs_init_fs_context(
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-	INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
 	/*
 	 * We don't create the finobt per-ag space reservation until after log
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ca9bfbd28886..404f2f32002f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -122,7 +122,7 @@ TRACE_EVENT(xlog_intent_recovery_failed,
 		  __entry->error, __entry->function)
 );
 
-DECLARE_EVENT_CLASS(xfs_perag_class,
+DECLARE_EVENT_CLASS(xfs_perag_ref_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
 		 unsigned long caller_ip),
 	TP_ARGS(mp, agno, refcount, caller_ip),
@@ -146,7 +146,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
 );
 
 #define DEFINE_PERAG_REF_EVENT(name)	\
-DEFINE_EVENT(xfs_perag_class, name,	\
+DEFINE_EVENT(xfs_perag_ref_class, name,	\
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,	\
 		 unsigned long caller_ip),					\
 	TP_ARGS(mp, agno, refcount, caller_ip))
@@ -155,6 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_inodegc_queue);
+DEFINE_PERAG_REF_EVENT(xfs_inodegc_worker);
 
 DECLARE_EVENT_CLASS(xfs_fs_class,
 	TP_PROTO(struct xfs_mount *mp, int data, unsigned long caller_ip),
@@ -191,8 +193,6 @@ DEFINE_EVENT(xfs_fs_class, name,					\
 DEFINE_FS_EVENT(xfs_inodegc_flush);
 DEFINE_FS_EVENT(xfs_inodegc_start);
 DEFINE_FS_EVENT(xfs_inodegc_stop);
-DEFINE_FS_EVENT(xfs_inodegc_queue);
-DEFINE_FS_EVENT(xfs_inodegc_worker);
 DEFINE_FS_EVENT(xfs_fs_sync_fs);
 
 DECLARE_EVENT_CLASS(xfs_ag_class,
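
For reference, here is the generic shape of the per-object delayed-work
pattern that this patch applies to struct xfs_perag.  A hedged sketch
using the stock workqueue API on the system workqueue; the my_* names
are illustrative and nothing here is copied from the patch:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

struct my_obj {
	struct delayed_work	work;
};

/* Runs on a workqueue thread; recover the owning object the same way
 * xfs_inodegc_worker() now recovers its struct xfs_perag. */
static void my_worker(struct work_struct *work)
{
	struct my_obj *obj = container_of(to_delayed_work(work),
					  struct my_obj, work);

	pr_debug("processing %p\n", obj);
	/* ... per-object background processing ... */
}

static void my_obj_init(struct my_obj *obj)
{
	INIT_DELAYED_WORK(&obj->work, my_worker);
}

/* queue_delayed_work() is a no-op if the work is already queued;
 * mod_delayed_work() reschedules it instead, which is how
 * xfs_inodegc_queue_sooner() pulls pending work forward. */
static void my_obj_kick(struct my_obj *obj, unsigned int delay_ms)
{
	mod_delayed_work(system_wq, &obj->work, msecs_to_jiffies(delay_ms));
}

/* Teardown must cancel and wait before the object is freed, as
 * xfs_free_perag() now does for each AG's work item. */
static void my_obj_destroy(struct my_obj *obj)
{
	cancel_delayed_work_sync(&obj->work);
}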

