All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 00/10] xfs: incore unlinked list
@ 2019-02-04 17:58 Darrick J. Wong
  2019-02-04 17:59 ` [PATCH 01/10] xfs: clean up iunlink functions Darrick J. Wong
                   ` (9 more replies)
  0 siblings, 10 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:58 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

Hi all,

This new patch series refactors the existing code that handles metadata
updates to the unlinked list when adding or removing inodes from that
list.  It then adds an in-core hashtable to record which inode's
next_unlinked field points to a given inode.  This enables us to remove
any inode from the on-disk unlinked structure without having to actually
walk the entire unlinked list, which reduces overhead substantially.

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=incore-unlinked-list

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=incore-unlinked-list

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=incore-unlinked-list

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 01/10] xfs: clean up iunlink functions
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 19:24   ` Brian Foster
  2019-02-04 20:51   ` Christoph Hellwig
  2019-02-04 17:59 ` [PATCH 02/10] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Fix some indentation issues with the iunlink functions and reorganize
the tops of the functions to be identical.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   79 +++++++++++++++++++++-------------------------------
 1 file changed, 32 insertions(+), 47 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8bf02be0003..2367cdfcd90b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1892,26 +1892,24 @@ xfs_inactive(
  */
 STATIC int
 xfs_iunlink(
-	struct xfs_trans *tp,
-	struct xfs_inode *ip)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
-	xfs_mount_t	*mp = tp->t_mountp;
-	xfs_agi_t	*agi;
-	xfs_dinode_t	*dip;
-	xfs_buf_t	*agibp;
-	xfs_buf_t	*ibp;
-	xfs_agino_t	agino;
-	short		bucket_index;
-	int		offset;
-	int		error;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*agibp;
+	struct xfs_buf		*ibp;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			offset;
+	int			error;
 
 	ASSERT(VFS_I(ip)->i_mode != 0);
 
-	/*
-	 * Get the agi buffer first.  It ensures lock ordering
-	 * on the list.
-	 */
-	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
+	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
 		return error;
 	agi = XFS_BUF_TO_AGI(agibp);
@@ -1920,9 +1918,6 @@ xfs_iunlink(
 	 * Get the index into the agi hash table for the
 	 * list this inode will go on.
 	 */
-	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	ASSERT(agino != 0);
-	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	ASSERT(agi->agi_unlinked[bucket_index]);
 	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
 
@@ -1969,45 +1964,35 @@ xfs_iunlink(
  */
 STATIC int
 xfs_iunlink_remove(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
-	xfs_ino_t	next_ino;
-	xfs_mount_t	*mp;
-	xfs_agi_t	*agi;
-	xfs_dinode_t	*dip;
-	xfs_buf_t	*agibp;
-	xfs_buf_t	*ibp;
-	xfs_agnumber_t	agno;
-	xfs_agino_t	agino;
-	xfs_agino_t	next_agino;
-	xfs_buf_t	*last_ibp;
-	xfs_dinode_t	*last_dip = NULL;
-	short		bucket_index;
-	int		offset, last_offset = 0;
-	int		error;
-
-	mp = tp->t_mountp;
-	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*agibp;
+	struct xfs_buf		*ibp;
+	struct xfs_buf		*last_ibp;
+	struct xfs_dinode	*last_dip = NULL;
+	xfs_ino_t		next_ino;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	xfs_agino_t		next_agino;
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			offset;
+	int			last_offset = 0;
+	int			error;
 
-	/*
-	 * Get the agi buffer first.  It ensures lock ordering
-	 * on the list.
-	 */
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
 		return error;
-
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
 	 * Get the index into the agi hash table for the
 	 * list this inode will go on.
 	 */
-	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	if (!xfs_verify_agino(mp, agno, agino))
-		return -EFSCORRUPTED;
-	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	if (!xfs_verify_agino(mp, agno,
 			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
  2019-02-04 17:59 ` [PATCH 01/10] xfs: clean up iunlink functions Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 19:24   ` Brian Foster
  2019-02-04 17:59 ` [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper Darrick J. Wong
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Track the number of unlinked inodes in each AG so that we can use these
decisions to throttle inactivations when the unlinked list gets long.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c       |   35 ++++++++++++++++++++++++-----------
 fs/xfs/xfs_log_recover.c |    6 ++++++
 fs/xfs/xfs_mount.c       |    2 ++
 fs/xfs/xfs_mount.h       |    3 +++
 4 files changed, 35 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2367cdfcd90b..435901c7c674 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1900,6 +1900,7 @@ xfs_iunlink(
 	struct xfs_dinode	*dip;
 	struct xfs_buf		*agibp;
 	struct xfs_buf		*ibp;
+	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
@@ -1907,11 +1908,12 @@ xfs_iunlink(
 	int			error;
 
 	ASSERT(VFS_I(ip)->i_mode != 0);
+	pag = xfs_perag_get(mp, agno);
 
 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
-		return error;
+		goto out;
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
@@ -1931,7 +1933,7 @@ xfs_iunlink(
 		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
 				       0, 0);
 		if (error)
-			return error;
+			goto out;
 
 		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
 		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
@@ -1956,7 +1958,10 @@ xfs_iunlink(
 		(sizeof(xfs_agino_t) * bucket_index);
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
-	return 0;
+	pag->pagi_unlinked_count++;
+out:
+	xfs_perag_put(pag);
+	return error;
 }
 
 /*
@@ -1974,6 +1979,7 @@ xfs_iunlink_remove(
 	struct xfs_buf		*ibp;
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
+	struct xfs_perag	*pag;
 	xfs_ino_t		next_ino;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
@@ -1983,10 +1989,12 @@ xfs_iunlink_remove(
 	int			last_offset = 0;
 	int			error;
 
+	pag = xfs_perag_get(mp, agno);
+
 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
-		return error;
+		goto out;
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
@@ -1997,7 +2005,8 @@ xfs_iunlink_remove(
 			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
-		return -EFSCORRUPTED;
+		error = -EFSCORRUPTED;
+		goto out;
 	}
 
 	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
@@ -2013,7 +2022,7 @@ xfs_iunlink_remove(
 		if (error) {
 			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
 				__func__, error);
-			return error;
+			goto out;
 		}
 		next_agino = be32_to_cpu(dip->di_next_unlinked);
 		ASSERT(next_agino != 0);
@@ -2062,7 +2071,7 @@ xfs_iunlink_remove(
 				xfs_warn(mp,
 	"%s: xfs_imap returned error %d.",
 					 __func__, error);
-				return error;
+				goto out;
 			}
 
 			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
@@ -2071,7 +2080,7 @@ xfs_iunlink_remove(
 				xfs_warn(mp,
 	"%s: xfs_imap_to_bp returned error %d.",
 					__func__, error);
-				return error;
+				goto out;
 			}
 
 			last_offset = imap.im_boffset;
@@ -2080,7 +2089,8 @@ xfs_iunlink_remove(
 				XFS_CORRUPTION_ERROR(__func__,
 						XFS_ERRLEVEL_LOW, mp,
 						last_dip, sizeof(*last_dip));
-				return -EFSCORRUPTED;
+				error = -EFSCORRUPTED;
+				goto out;
 			}
 		}
 
@@ -2093,7 +2103,7 @@ xfs_iunlink_remove(
 		if (error) {
 			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
 				__func__, error);
-			return error;
+			goto out;
 		}
 		next_agino = be32_to_cpu(dip->di_next_unlinked);
 		ASSERT(next_agino != 0);
@@ -2128,7 +2138,10 @@ xfs_iunlink_remove(
 				  (offset + sizeof(xfs_agino_t) - 1));
 		xfs_inobp_check(mp, last_ibp);
 	}
-	return 0;
+	pag->pagi_unlinked_count--;
+out:
+	xfs_perag_put(pag);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index ff9a27834c50..c634fbfea4a8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5054,6 +5054,7 @@ xlog_recover_process_one_iunlink(
 	struct xfs_buf			*ibp;
 	struct xfs_dinode		*dip;
 	struct xfs_inode		*ip;
+	struct xfs_perag		*pag;
 	xfs_ino_t			ino;
 	int				error;
 
@@ -5077,6 +5078,11 @@ xlog_recover_process_one_iunlink(
 	agino = be32_to_cpu(dip->di_next_unlinked);
 	xfs_buf_relse(ibp);
 
+	/* Make sure the in-core data knows about this unlinked inode. */
+	pag = xfs_perag_get(mp, agno);
+	pag->pagi_unlinked_count++;
+	xfs_perag_put(pag);
+
 	/*
 	 * Prevent any DMAPI event from being sent when the reference on
 	 * the inode is dropped.
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10be706ec72e..6ca92a71c233 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,8 @@ xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		ASSERT(pag->pagi_unlinked_count == 0 ||
+		       XFS_FORCED_SHUTDOWN(mp));
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e344b1dfde63..169069a01f3c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -388,6 +388,9 @@ typedef struct xfs_perag {
 
 	/* reference count */
 	uint8_t			pagf_refcount_level;
+
+	/* unlinked inode info; lock AGI to access */
+	unsigned int		pagi_unlinked_count;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
  2019-02-04 17:59 ` [PATCH 01/10] xfs: clean up iunlink functions Darrick J. Wong
  2019-02-04 17:59 ` [PATCH 02/10] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 19:24   ` Brian Foster
  2019-02-04 20:53   ` Christoph Hellwig
  2019-02-04 17:59 ` [PATCH 04/10] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
                   ` (6 subsequent siblings)
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add a new helper to check that a per-AG inode pointer is either null or
points somewhere valid within that AG.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_inode_buf.c |    3 +--
 fs/xfs/libxfs/xfs_types.c     |   13 +++++++++++++
 fs/xfs/libxfs/xfs_types.h     |    2 ++
 fs/xfs/scrub/agheader.c       |    8 +++-----
 4 files changed, 19 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8fa1050c1ae2..77e412a4b6ea 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -99,8 +99,7 @@ xfs_inode_buf_verify(
 		unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
 		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
 			xfs_dinode_good_version(mp, dip->di_version) &&
-			(unlinked_ino == NULLAGINO ||
-			 xfs_verify_agino(mp, agno, unlinked_ino));
+			xfs_verify_agino_or_null(mp, agno, unlinked_ino);
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 						XFS_ERRTAG_ITOBP_INOTOBP))) {
 			if (readahead) {
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 0e35b4bdfef7..5b37c99b2811 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -115,6 +115,19 @@ xfs_verify_agino(
 	return agino >= first && agino <= last;
 }
 
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino)
+{
+	return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
 /*
  * Verify that an FS inode number pointer neither points outside the
  * filesystem nor points at static AG metadata.
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 704b4f308780..c5a25403b4db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -183,6 +183,8 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
 		xfs_agino_t *first, xfs_agino_t *last);
 bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
 		xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_agino_t agino);
 bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90955ab1e895..9d4e8293d37e 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -864,19 +864,17 @@ xchk_agi(
 
 	/* Check inode pointers */
 	agino = be32_to_cpu(agi->agi_newino);
-	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+	if (!xfs_verify_agino_or_null(mp, agno, agino))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	agino = be32_to_cpu(agi->agi_dirino);
-	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+	if (!xfs_verify_agino_or_null(mp, agno, agino))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	/* Check unlinked inode buckets */
 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
 		agino = be32_to_cpu(agi->agi_unlinked[i]);
-		if (agino == NULLAGINO)
-			continue;
-		if (!xfs_verify_agino(mp, agno, agino))
+		if (!xfs_verify_agino_or_null(mp, agno, agino))
 			xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 	}
 

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 04/10] xfs: refactor AGI unlinked bucket updates
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (2 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 19:25   ` Brian Foster
  2019-02-04 20:54   ` Christoph Hellwig
  2019-02-04 17:59 ` [PATCH 05/10] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
                   ` (5 subsequent siblings)
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Split the AGI unlinked bucket updates into a separate function.  No
functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   66 +++++++++++++++++++++++++++++++++++++---------------
 fs/xfs/xfs_trace.h |   26 ++++++++++++++++++++
 2 files changed, 73 insertions(+), 19 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 435901c7c674..8c4a1d198de8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1880,6 +1880,43 @@ xfs_inactive(
 	xfs_qm_dqdetach(ip);
 }
 
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		*agibp,
+	unsigned int		bucket_index,
+	xfs_agino_t		new_agino)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
+	xfs_agino_t		old_value;
+	int			offset;
+
+	ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+			old_value, new_agino);
+
+	/*
+	 * We should never find the head of the list already set to the value
+	 * passed in because either we're adding or removing ourselves from the
+	 * head of the list.
+	 */
+	if (old_value == new_agino)
+		return -EFSCORRUPTED;
+
+	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+	offset = offsetof(struct xfs_agi, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	return 0;
+}
+
 /*
  * This is called when the inode's link count goes to 0 or we are creating a
  * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
@@ -1949,15 +1986,10 @@ xfs_iunlink(
 		xfs_inobp_check(mp, ibp);
 	}
 
-	/*
-	 * Point the bucket head pointer at the inode being inserted.
-	 */
-	ASSERT(agino != 0);
-	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
-	offset = offsetof(xfs_agi_t, agi_unlinked) +
-		(sizeof(xfs_agino_t) * bucket_index);
-	xfs_trans_log_buf(tp, agibp, offset,
-			  (offset + sizeof(xfs_agino_t) - 1));
+	/* Point the head of the list to point to this inode. */
+	error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+	if (error)
+		goto out;
 	pag->pagi_unlinked_count++;
 out:
 	xfs_perag_put(pag);
@@ -2041,16 +2073,12 @@ xfs_iunlink_remove(
 		} else {
 			xfs_trans_brelse(tp, ibp);
 		}
-		/*
-		 * Point the bucket head pointer at the next inode.
-		 */
-		ASSERT(next_agino != 0);
-		ASSERT(next_agino != agino);
-		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
-		offset = offsetof(xfs_agi_t, agi_unlinked) +
-			(sizeof(xfs_agino_t) * bucket_index);
-		xfs_trans_log_buf(tp, agibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
+
+		/* Point the head of the list to the next unlinked inode. */
+		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+				next_agino);
+		if (error)
+			goto out;
 	} else {
 		/*
 		 * We need to search the list for the inode being freed.
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..c10478e7e49a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3371,6 +3371,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
 DEFINE_TRANS_EVENT(xfs_trans_add_item);
 DEFINE_TRANS_EVENT(xfs_trans_free_items);
 
+TRACE_EVENT(xfs_iunlink_update_bucket,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+	TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(unsigned int, bucket)
+		__field(xfs_agino_t, old_ptr)
+		__field(xfs_agino_t, new_ptr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->bucket = bucket;
+		__entry->old_ptr = old_ptr;
+		__entry->new_ptr = new_ptr;
+	),
+	TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->bucket,
+		  __entry->old_ptr,
+		  __entry->new_ptr)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 05/10] xfs: strengthen AGI unlinked inode bucket pointer checks
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (3 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 04/10] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 17:59 ` [PATCH 06/10] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs, Brian Foster, Christoph Hellwig

From: Darrick J. Wong <darrick.wong@oracle.com>

Strengthen our checking of the AGI unlinked pointers when we start to
use them for updating the metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_inode.c |   27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8c4a1d198de8..9c17ae95b18f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1938,6 +1938,7 @@ xfs_iunlink(
 	struct xfs_buf		*agibp;
 	struct xfs_buf		*ibp;
 	struct xfs_perag	*pag;
+	xfs_agino_t		next_agino;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
@@ -1954,13 +1955,18 @@ xfs_iunlink(
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
-	 * Get the index into the agi hash table for the
-	 * list this inode will go on.
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the pointer isn't garbage and that this inode
+	 * isn't already on the list.
 	 */
-	ASSERT(agi->agi_unlinked[bucket_index]);
-	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (next_agino == agino ||
+	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
+		error = -EFSCORRUPTED;
+		return error;
+	}
 
-	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
+	if (next_agino != NULLAGINO) {
 		/*
 		 * There is already another inode in the bucket we need
 		 * to add ourselves to.  Add us at the front of the list.
@@ -2030,18 +2036,18 @@ xfs_iunlink_remove(
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
-	 * Get the index into the agi hash table for the
-	 * list this inode will go on.
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the head pointer isn't garbage.
 	 */
-	if (!xfs_verify_agino(mp, agno,
-			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(mp, agno, next_agino)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
 		error = -EFSCORRUPTED;
 		goto out;
 	}
 
-	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
+	if (next_agino == agino) {
 		/*
 		 * We're at the head of the list.  Get the inode's on-disk
 		 * buffer to see if there is anyone after us on the list.
@@ -2083,7 +2089,6 @@ xfs_iunlink_remove(
 		/*
 		 * We need to search the list for the inode being freed.
 		 */
-		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
 		last_ibp = NULL;
 		while (next_agino != agino) {
 			struct xfs_imap	imap;

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 06/10] xfs: refactor inode unlinked pointer update functions
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (4 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 05/10] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 20:56   ` Christoph Hellwig
  2019-02-05 14:23   ` Brian Foster
  2019-02-04 17:59 ` [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function Darrick J. Wong
                   ` (3 subsequent siblings)
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Hoist the functions that update an inode's unlinked pointer updates into
a helper.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |  191 +++++++++++++++++++++++++++-------------------------
 fs/xfs/xfs_trace.h |   26 +++++++
 2 files changed, 124 insertions(+), 93 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9c17ae95b18f..f23f13f0f2e6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1917,6 +1917,85 @@ xfs_iunlink_update_bucket(
 	return 0;
 }
 
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		*ibp,
+	struct xfs_dinode	*dip,
+	struct xfs_imap		*imap,
+	xfs_ino_t		ino,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			offset;
+
+	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+	trace_xfs_iunlink_update_dinode(mp, agno, XFS_INO_TO_AGINO(mp, ino),
+			be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+	dip->di_next_unlinked = cpu_to_be32(next_agino);
+	offset = imap->im_boffset +
+			offsetof(struct xfs_dinode, di_next_unlinked);
+
+	/* need to recalc the inode CRC if appropriate */
+	xfs_dinode_calc_crc(mp, dip);
+	xfs_trans_inode_buf(tp, ibp);
+	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		next_agino,
+	xfs_agino_t		*old_next_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*ibp;
+	xfs_agino_t		old_value;
+	int			error;
+
+	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+	if (error)
+		return error;
+
+	/* Make sure the old pointer isn't garbage. */
+	old_value = be32_to_cpu(dip->di_next_unlinked);
+	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/*
+	 * Since we're updating a linked list, we should never find that the
+	 * current pointer is the same as the new value, unless we're
+	 * terminating the list.
+	 */
+	*old_next_agino = old_value;
+	if (old_value == next_agino) {
+		if (next_agino != NULLAGINO)
+			error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* Ok, update the new pointer. */
+	xfs_iunlink_update_dinode(tp, agno, ibp, dip, &ip->i_imap, ip->i_ino,
+			next_agino);
+	return 0;
+out:
+	xfs_trans_brelse(tp, ibp);
+	return error;
+}
+
 /*
  * This is called when the inode's link count goes to 0 or we are creating a
  * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
@@ -1934,15 +2013,12 @@ xfs_iunlink(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_agi		*agi;
-	struct xfs_dinode	*dip;
 	struct xfs_buf		*agibp;
-	struct xfs_buf		*ibp;
 	struct xfs_perag	*pag;
 	xfs_agino_t		next_agino;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	int			offset;
 	int			error;
 
 	ASSERT(VFS_I(ip)->i_mode != 0);
@@ -1963,33 +2039,21 @@ xfs_iunlink(
 	if (next_agino == agino ||
 	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
 		error = -EFSCORRUPTED;
-		return error;
+		goto out;
 	}
 
 	if (next_agino != NULLAGINO) {
+		xfs_agino_t	old_agino;
+
 		/*
-		 * There is already another inode in the bucket we need
-		 * to add ourselves to.  Add us at the front of the list.
-		 * Here we put the head pointer into our next pointer,
-		 * and then we fall through to point the head at us.
+		 * There is already another inode in the bucket, so point this
+		 * inode to the current head of the list.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
+		error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+				&old_agino);
 		if (error)
 			goto out;
-
-		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
-		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
-		offset = ip->i_imap.im_boffset +
-			offsetof(xfs_dinode_t, di_next_unlinked);
-
-		/* need to recalc the inode CRC if appropriate */
-		xfs_dinode_calc_crc(mp, dip);
-
-		xfs_trans_inode_buf(tp, ibp);
-		xfs_trans_log_buf(tp, ibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-		xfs_inobp_check(mp, ibp);
+		ASSERT(old_agino == NULLAGINO);
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2012,9 +2076,7 @@ xfs_iunlink_remove(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_agi		*agi;
-	struct xfs_dinode	*dip;
 	struct xfs_buf		*agibp;
-	struct xfs_buf		*ibp;
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
 	struct xfs_perag	*pag;
@@ -2023,8 +2085,6 @@ xfs_iunlink_remove(
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agino_t		next_agino;
 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	int			offset;
-	int			last_offset = 0;
 	int			error;
 
 	pag = xfs_perag_get(mp, agno);
@@ -2051,34 +2111,11 @@ xfs_iunlink_remove(
 		/*
 		 * We're at the head of the list.  Get the inode's on-disk
 		 * buffer to see if there is anyone after us on the list.
-		 * Only modify our next pointer if it is not already NULLAGINO.
-		 * This saves us the overhead of dealing with the buffer when
-		 * there is no need to change it.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
-				__func__, error);
+		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
+				&next_agino);
+		if (error)
 			goto out;
-		}
-		next_agino = be32_to_cpu(dip->di_next_unlinked);
-		ASSERT(next_agino != 0);
-		if (next_agino != NULLAGINO) {
-			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			offset = ip->i_imap.im_boffset +
-				offsetof(xfs_dinode_t, di_next_unlinked);
-
-			/* need to recalc the inode CRC if appropriate */
-			xfs_dinode_calc_crc(mp, dip);
-
-			xfs_trans_inode_buf(tp, ibp);
-			xfs_trans_log_buf(tp, ibp, offset,
-					  (offset + sizeof(xfs_agino_t) - 1));
-			xfs_inobp_check(mp, ibp);
-		} else {
-			xfs_trans_brelse(tp, ibp);
-		}
 
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
@@ -2086,13 +2123,13 @@ xfs_iunlink_remove(
 		if (error)
 			goto out;
 	} else {
+		struct xfs_imap	imap;
+
 		/*
 		 * We need to search the list for the inode being freed.
 		 */
 		last_ibp = NULL;
 		while (next_agino != agino) {
-			struct xfs_imap	imap;
-
 			if (last_ibp)
 				xfs_trans_brelse(tp, last_ibp);
 
@@ -2116,7 +2153,6 @@ xfs_iunlink_remove(
 				goto out;
 			}
 
-			last_offset = imap.im_boffset;
 			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
 			if (!xfs_verify_agino(mp, agno, next_agino)) {
 				XFS_CORRUPTION_ERROR(__func__,
@@ -2131,45 +2167,14 @@ xfs_iunlink_remove(
 		 * Now last_ibp points to the buffer previous to us on the
 		 * unlinked list.  Pull us from the list.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
-				__func__, error);
+		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
+				&next_agino);
+		if (error)
 			goto out;
-		}
-		next_agino = be32_to_cpu(dip->di_next_unlinked);
-		ASSERT(next_agino != 0);
-		ASSERT(next_agino != agino);
-		if (next_agino != NULLAGINO) {
-			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			offset = ip->i_imap.im_boffset +
-				offsetof(xfs_dinode_t, di_next_unlinked);
-
-			/* need to recalc the inode CRC if appropriate */
-			xfs_dinode_calc_crc(mp, dip);
-
-			xfs_trans_inode_buf(tp, ibp);
-			xfs_trans_log_buf(tp, ibp, offset,
-					  (offset + sizeof(xfs_agino_t) - 1));
-			xfs_inobp_check(mp, ibp);
-		} else {
-			xfs_trans_brelse(tp, ibp);
-		}
-		/*
-		 * Point the previous inode on the list to the next inode.
-		 */
-		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
-		ASSERT(next_agino != 0);
-		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
-		/* need to recalc the inode CRC if appropriate */
-		xfs_dinode_calc_crc(mp, last_dip);
 
-		xfs_trans_inode_buf(tp, last_ibp);
-		xfs_trans_log_buf(tp, last_ibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-		xfs_inobp_check(mp, last_ibp);
+		/* Point the previous inode on the list to the next inode. */
+		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
+				next_ino, next_agino);
 	}
 	pag->pagi_unlinked_count--;
 out:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c10478e7e49a..fbec8f0e1a9a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3397,6 +3397,32 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
 		  __entry->new_ptr)
 );
 
+TRACE_EVENT(xfs_iunlink_update_dinode,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+	TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(xfs_agino_t, old_ptr)
+		__field(xfs_agino_t, new_ptr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->old_ptr = old_ptr;
+		__entry->new_ptr = new_ptr;
+	),
+	TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agino,
+		  __entry->old_ptr,
+		  __entry->new_ptr)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (5 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 06/10] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 20:58   ` Christoph Hellwig
  2019-02-04 17:59 ` [PATCH 08/10] xfs: refactor inode update in iunlink_remove Darrick J. Wong
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

There's a loop that searches an unlinked bucket list to find the inode
that points to a given inode.  Hoist this into a separate function;
later we'll use our iunlink backref cache to bypass the slow list
operation.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |  145 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 106 insertions(+), 39 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f23f13f0f2e6..8af5f4e989ac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2066,6 +2066,105 @@ xfs_iunlink(
 	return error;
 }
 
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			error;
+
+	imap->im_blkno = 0;
+	error = xfs_imap(mp, tp, ino, imap, 0);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_imap returned error %d.",
+				__func__, error);
+		return error;
+	}
+
+	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+				__func__, error);
+		return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino.  Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @ino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @ino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	xfs_agino_t		head_agino,
+	xfs_agino_t		target_agino,
+	xfs_ino_t		*ino,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*last_ibp = NULL;
+	struct xfs_dinode	*last_dip;
+	xfs_ino_t		next_ino = NULLFSINO;
+	xfs_agino_t		next_agino;
+	int			error;
+
+	ASSERT(head_agino != target_agino);
+
+	next_agino = head_agino;
+	while (next_agino != target_agino) {
+		xfs_agino_t	unlinked_agino;
+
+		if (last_ibp)
+			xfs_trans_brelse(tp, last_ibp);
+
+		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
+				&last_ibp);
+		if (error)
+			return error;
+
+		unlinked_agino = be32_to_cpu(last_dip->di_next_unlinked);
+		/*
+		 * Make sure this pointer is valid and isn't an obvious
+		 * infinite loop.
+		 */
+		if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) ||
+		    next_agino == unlinked_agino) {
+			XFS_CORRUPTION_ERROR(__func__,
+					XFS_ERRLEVEL_LOW, mp,
+					last_dip, sizeof(*last_dip));
+			error = -EFSCORRUPTED;
+			return error;
+		}
+		next_agino = unlinked_agino;
+	}
+
+	/* Should never happen, but don't return garbage. */
+	if (next_ino == NULLFSINO)
+		return -EFSCORRUPTED;
+
+	*ino = next_ino;
+	*dipp = last_dip;
+	*bpp = last_ibp;
+	return 0;
+}
+
 /*
  * Pull the on-disk inode from the AGI unlinked list.
  */
@@ -2080,7 +2179,7 @@ xfs_iunlink_remove(
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
 	struct xfs_perag	*pag;
-	xfs_ino_t		next_ino;
+	xfs_ino_t		prev_ino;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agino_t		next_agino;
@@ -2125,43 +2224,11 @@ xfs_iunlink_remove(
 	} else {
 		struct xfs_imap	imap;
 
-		/*
-		 * We need to search the list for the inode being freed.
-		 */
-		last_ibp = NULL;
-		while (next_agino != agino) {
-			if (last_ibp)
-				xfs_trans_brelse(tp, last_ibp);
-
-			imap.im_blkno = 0;
-			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
-
-			error = xfs_imap(mp, tp, next_ino, &imap, 0);
-			if (error) {
-				xfs_warn(mp,
-	"%s: xfs_imap returned error %d.",
-					 __func__, error);
-				goto out;
-			}
-
-			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
-					       &last_ibp, 0, 0);
-			if (error) {
-				xfs_warn(mp,
-	"%s: xfs_imap_to_bp returned error %d.",
-					__func__, error);
-				goto out;
-			}
-
-			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
-			if (!xfs_verify_agino(mp, agno, next_agino)) {
-				XFS_CORRUPTION_ERROR(__func__,
-						XFS_ERRLEVEL_LOW, mp,
-						last_dip, sizeof(*last_dip));
-				error = -EFSCORRUPTED;
-				goto out;
-			}
-		}
+		/* We need to search the list for the inode being freed. */
+		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
+				&prev_ino, &imap, &last_dip, &last_ibp);
+		if (error)
+			goto out;
 
 		/*
 		 * Now last_ibp points to the buffer previous to us on the
@@ -2174,7 +2241,7 @@ xfs_iunlink_remove(
 
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
-				next_ino, next_agino);
+				prev_ino, next_agino);
 	}
 	pag->pagi_unlinked_count--;
 out:

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 08/10] xfs: refactor inode update in iunlink_remove
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (6 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 20:58   ` Christoph Hellwig
  2019-02-05 14:23   ` Brian Foster
  2019-02-04 17:59 ` [PATCH 09/10] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
  2019-02-04 18:00 ` [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

In xfs_iunlink_remove we have two identical calls to
xfs_iunlink_update_inode, so move it out of the if statement to simplify
the code some more.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8af5f4e989ac..fac9562ecd39 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2183,6 +2183,7 @@ xfs_iunlink_remove(
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agino_t		next_agino;
+	xfs_agino_t		head_agino;
 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	int			error;
 
@@ -2198,24 +2199,24 @@ xfs_iunlink_remove(
 	 * Get the index into the agi hash table for the list this inode will
 	 * go on.  Make sure the head pointer isn't garbage.
 	 */
-	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	if (!xfs_verify_agino(mp, agno, next_agino)) {
+	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(mp, agno, head_agino)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
 		error = -EFSCORRUPTED;
 		goto out;
 	}
 
-	if (next_agino == agino) {
-		/*
-		 * We're at the head of the list.  Get the inode's on-disk
-		 * buffer to see if there is anyone after us on the list.
-		 */
-		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
-				&next_agino);
-		if (error)
-			goto out;
+	/*
+	 * Set our inode's next_unlinked pointer to NULL and then return
+	 * the old pointer value so that we can update whatever was previous
+	 * to us in the list to point to whatever was next in the list.
+	 */
+	error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+	if (error)
+		goto out;
 
+	if (head_agino == agino) {
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
 				next_agino);
@@ -2225,20 +2226,11 @@ xfs_iunlink_remove(
 		struct xfs_imap	imap;
 
 		/* We need to search the list for the inode being freed. */
-		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
+		error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
 				&prev_ino, &imap, &last_dip, &last_ibp);
 		if (error)
 			goto out;
 
-		/*
-		 * Now last_ibp points to the buffer previous to us on the
-		 * unlinked list.  Pull us from the list.
-		 */
-		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
-				&next_agino);
-		if (error)
-			goto out;
-
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
 				prev_ino, next_agino);

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 09/10] xfs: add tracepoints for high level iunlink operations
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (7 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 08/10] xfs: refactor inode update in iunlink_remove Darrick J. Wong
@ 2019-02-04 17:59 ` Darrick J. Wong
  2019-02-04 20:59   ` Christoph Hellwig
  2019-02-05 14:24   ` Brian Foster
  2019-02-04 18:00 ` [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 17:59 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add tracepoints so we can associate high level operations with low level
updates.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |    2 ++
 fs/xfs/xfs_trace.h |   25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fac9562ecd39..b9696d762c8f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2023,6 +2023,7 @@ xfs_iunlink(
 
 	ASSERT(VFS_I(ip)->i_mode != 0);
 	pag = xfs_perag_get(mp, agno);
+	trace_xfs_iunlink(ip);
 
 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
@@ -2188,6 +2189,7 @@ xfs_iunlink_remove(
 	int			error;
 
 	pag = xfs_perag_get(mp, agno);
+	trace_xfs_iunlink_remove(ip);
 
 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fbec8f0e1a9a..a6e384a642b1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3423,6 +3423,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
 		  __entry->new_ptr)
 );
 
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+	TP_PROTO(struct xfs_inode *ip),
+	TP_ARGS(ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+		__entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+	),
+	TP_printk("dev %d:%d agno %u agino %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+	TP_PROTO(struct xfs_inode *ip), \
+	TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
                   ` (8 preceding siblings ...)
  2019-02-04 17:59 ` [PATCH 09/10] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
@ 2019-02-04 18:00 ` Darrick J. Wong
  2019-02-04 21:08   ` Christoph Hellwig
  2019-02-05 14:24   ` Brian Foster
  9 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 18:00 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Use a rhashtable to cache the unlinked list incore.  This should speed
up unlinked processing considerably when there are a lot of inodes on
the unlinked list because iunlink_remove no longer has to traverse an
entire bucket list to find which inode points to the one being removed.

The incore list structure records "X.next_unlinked = Y" relations, with
the rhashtable using Y to index the records.  This makes finding the
inode X that points to a inode Y very quick.  If our cache fails to find
anything we can always fall back on the old method.

FWIW this drastically reduces the amount of time it takes to remove
inodes from the unlinked list.  I wrote a program to open a lot of
O_TMPFILE files and then close them in the same order, which takes
a very long time if we have to traverse the unlinked lists.  With the
ptach, I see:

+ /d/t/tmpfile/tmpfile
Opened 193531 files in 6.33s.
Closed 193531 files in 5.86s

real    0m12.192s
user    0m0.064s
sys     0m11.619s
+ cd /
+ umount /mnt

real    0m0.050s
user    0m0.004s
sys     0m0.030s

And without the patch:

+ /d/t/tmpfile/tmpfile
Opened 193588 files in 6.35s.
Closed 193588 files in 751.61s

real    12m38.853s
user    0m0.084s
sys     12m34.470s
+ cd /
+ umount /mnt

real    0m0.086s
user    0m0.000s
sys     0m0.060s

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h       |    9 ++
 fs/xfs/xfs_log_recover.c |   12 ++-
 fs/xfs/xfs_mount.c       |    5 +
 fs/xfs/xfs_mount.h       |    1 
 5 files changed, 233 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b9696d762c8f..baee8c894447 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1880,6 +1880,167 @@ xfs_inactive(
 	xfs_qm_dqdetach(ip);
 }
 
+/*
+ * In-Core Unlinked List Lookups
+ * ====================================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+	struct rhash_head	iu_rhash_head;
+	xfs_agino_t		iu_agino;
+	xfs_agino_t		iu_next_unlinked;
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const xfs_agino_t		*key = arg->key;
+	const struct xfs_iunlink	*iu = obj;
+
+	if (iu->iu_next_unlinked != *key)
+		return 1;
+	return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
+	.nelem_hint		= 512,
+	.key_len		= sizeof(xfs_agino_t),
+	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
+	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
+ */
+xfs_agino_t
+xfs_iunlink_lookup_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino)
+{
+	struct xfs_iunlink	*iu;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+			xfs_iunlink_hash_params);
+	return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+int
+xfs_iunlink_add_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		this_agino)
+{
+	struct xfs_iunlink	*iu;
+
+	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+	iu->iu_agino = prev_agino;
+	iu->iu_next_unlinked = this_agino;
+
+	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit.
+ */
+int
+xfs_iunlink_change_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		next_unlinked)
+{
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	/*
+	 * If there is no new next entry just free our item and return.
+	 */
+	if (next_unlinked == NULLAGINO) {
+		kmem_free(iu);
+		return 0;
+	}
+
+	/*
+	 * Else update the hash table to the new entry.
+	 */
+	iu->iu_next_unlinked = next_unlinked;
+	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+	struct xfs_perag	*pag)
+{
+	return rhashtable_init(&pag->pagi_unlinked_hash,
+			&xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+	void			*ptr,
+	void			*arg)
+{
+	struct xfs_iunlink	*iu = ptr;
+	bool			*freed_anything = arg;
+
+	*freed_anything = true;
+	kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+	struct xfs_perag	*pag)
+{
+	bool			freed_anything = false;
+
+	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+			xfs_iunlink_free_item, &freed_anything);
+
+	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
 /*
  * Point the AGI unlinked bucket at an inode and log the results.  The caller
  * is responsible for validating the old value.
@@ -2055,6 +2216,14 @@ xfs_iunlink(
 		if (error)
 			goto out;
 		ASSERT(old_agino == NULLAGINO);
+
+		/*
+		 * agino has been unlinked, add a backref from the next inode
+		 * back to agino.
+		 */
+		error = xfs_iunlink_add_backref(pag, agino, next_agino);
+		if (error)
+			goto out;
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2127,6 +2296,17 @@ xfs_iunlink_map_prev(
 
 	ASSERT(head_agino != target_agino);
 
+	/* See if our backref cache can find it faster. */
+	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
+	if (next_agino != NULLAGINO) {
+		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
+				&last_ibp);
+		if (error)
+			return error;
+		goto out;
+	}
+
 	next_agino = head_agino;
 	while (next_agino != target_agino) {
 		xfs_agino_t	unlinked_agino;
@@ -2156,6 +2336,7 @@ xfs_iunlink_map_prev(
 		next_agino = unlinked_agino;
 	}
 
+out:
 	/* Should never happen, but don't return garbage. */
 	if (next_ino == NULLFSINO)
 		return -EFSCORRUPTED;
@@ -2218,6 +2399,20 @@ xfs_iunlink_remove(
 	if (error)
 		goto out;
 
+	/*
+	 * If there was a backref pointing from the next inode back to this
+	 * one, remove it because we've removed this inode from the list.
+	 *
+	 * Later, if this inode was in the middle of the list we'll update
+	 * this inode's backref to point from the next inode.
+	 */
+	if (next_agino != NULLAGINO) {
+		error = xfs_iunlink_change_backref(pag, next_agino,
+				NULLAGINO);
+		if (error)
+			goto out;
+	}
+
 	if (head_agino == agino) {
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
@@ -2236,6 +2431,18 @@ xfs_iunlink_remove(
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
 				prev_ino, next_agino);
+
+		/*
+		 * Now we deal with the backref for this inode.  If this inode
+		 * pointed at a real inode, change the backref that pointed to
+		 * us to point to our old next.  If this inode was the end of
+		 * the list, delete the backref that pointed to us.  Note that
+		 * change_backref takes care of deleting the backref if
+		 * next_agino is NULLAGINO.
+		 */
+		error = xfs_iunlink_change_backref(pag, agino, next_agino);
+		if (error)
+			goto out;
 	}
 	pag->pagi_unlinked_count--;
 out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..0a83bfacfd33 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,13 @@ extern struct kmem_zone	*xfs_inode_zone;
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+xfs_agino_t xfs_iunlink_lookup_backref(struct xfs_perag *pag,
+		xfs_agino_t agino);
+int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
+		xfs_agino_t this_agino);
+int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
+		xfs_agino_t this_agino);
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c634fbfea4a8..f5fb8885662f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5078,10 +5078,20 @@ xlog_recover_process_one_iunlink(
 	agino = be32_to_cpu(dip->di_next_unlinked);
 	xfs_buf_relse(ibp);
 
-	/* Make sure the in-core data knows about this unlinked inode. */
+	/*
+	 * Make sure the in-core data knows about this unlinked inode.  Since
+	 * our iunlinks recovery basically just deletes the head of a bucket
+	 * list until the bucket is empty, we need only to add the backref from
+	 * the current list item to the next one, if this isn't the list tail.
+	 */
 	pag = xfs_perag_get(mp, agno);
 	pag->pagi_unlinked_count++;
+	if (agino != NULLAGINO)
+		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
+				agino);
 	xfs_perag_put(pag);
+	if (error)
+		goto fail_iput;
 
 	/*
 	 * Prevent any DMAPI event from being sent when the reference on
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6ca92a71c233..9a181f7ca1d5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -151,6 +151,7 @@ xfs_free_perag(
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		ASSERT(pag->pagi_unlinked_count == 0 ||
 		       XFS_FORCED_SHUTDOWN(mp));
+		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -229,6 +230,9 @@ xfs_initialize_perag(
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
+		error = xfs_iunlink_init(pag);
+		if (error)
+			goto out_hash_destroy;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -251,6 +255,7 @@ xfs_initialize_perag(
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 169069a01f3c..dcc45b441dd6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -391,6 +391,7 @@ typedef struct xfs_perag {
 
 	/* unlinked inode info; lock AGI to access */
 	unsigned int		pagi_unlinked_count;
+	struct rhashtable	pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 01/10] xfs: clean up iunlink functions
  2019-02-04 17:59 ` [PATCH 01/10] xfs: clean up iunlink functions Darrick J. Wong
@ 2019-02-04 19:24   ` Brian Foster
  2019-02-04 20:51   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-04 19:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:03AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Fix some indentation issues with the iunlink functions and reorganize
> the tops of the functions to be identical.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_inode.c |   79 +++++++++++++++++++++-------------------------------
>  1 file changed, 32 insertions(+), 47 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index c8bf02be0003..2367cdfcd90b 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1892,26 +1892,24 @@ xfs_inactive(
>   */
>  STATIC int
>  xfs_iunlink(
> -	struct xfs_trans *tp,
> -	struct xfs_inode *ip)
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip)
>  {
> -	xfs_mount_t	*mp = tp->t_mountp;
> -	xfs_agi_t	*agi;
> -	xfs_dinode_t	*dip;
> -	xfs_buf_t	*agibp;
> -	xfs_buf_t	*ibp;
> -	xfs_agino_t	agino;
> -	short		bucket_index;
> -	int		offset;
> -	int		error;
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_agi		*agi;
> +	struct xfs_dinode	*dip;
> +	struct xfs_buf		*agibp;
> +	struct xfs_buf		*ibp;
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> +	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> +	int			offset;
> +	int			error;
>  
>  	ASSERT(VFS_I(ip)->i_mode != 0);
>  
> -	/*
> -	 * Get the agi buffer first.  It ensures lock ordering
> -	 * on the list.
> -	 */
> -	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
> +	/* Get the agi buffer first.  It ensures lock ordering on the list. */
> +	error = xfs_read_agi(mp, tp, agno, &agibp);
>  	if (error)
>  		return error;
>  	agi = XFS_BUF_TO_AGI(agibp);
> @@ -1920,9 +1918,6 @@ xfs_iunlink(
>  	 * Get the index into the agi hash table for the
>  	 * list this inode will go on.
>  	 */
> -	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> -	ASSERT(agino != 0);
> -	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
>  	ASSERT(agi->agi_unlinked[bucket_index]);
>  	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
>  
> @@ -1969,45 +1964,35 @@ xfs_iunlink(
>   */
>  STATIC int
>  xfs_iunlink_remove(
> -	xfs_trans_t	*tp,
> -	xfs_inode_t	*ip)
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip)
>  {
> -	xfs_ino_t	next_ino;
> -	xfs_mount_t	*mp;
> -	xfs_agi_t	*agi;
> -	xfs_dinode_t	*dip;
> -	xfs_buf_t	*agibp;
> -	xfs_buf_t	*ibp;
> -	xfs_agnumber_t	agno;
> -	xfs_agino_t	agino;
> -	xfs_agino_t	next_agino;
> -	xfs_buf_t	*last_ibp;
> -	xfs_dinode_t	*last_dip = NULL;
> -	short		bucket_index;
> -	int		offset, last_offset = 0;
> -	int		error;
> -
> -	mp = tp->t_mountp;
> -	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_agi		*agi;
> +	struct xfs_dinode	*dip;
> +	struct xfs_buf		*agibp;
> +	struct xfs_buf		*ibp;
> +	struct xfs_buf		*last_ibp;
> +	struct xfs_dinode	*last_dip = NULL;
> +	xfs_ino_t		next_ino;
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> +	xfs_agino_t		next_agino;
> +	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> +	int			offset;
> +	int			last_offset = 0;
> +	int			error;
>  
> -	/*
> -	 * Get the agi buffer first.  It ensures lock ordering
> -	 * on the list.
> -	 */
> +	/* Get the agi buffer first.  It ensures lock ordering on the list. */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
>  	if (error)
>  		return error;
> -
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
>  	 * Get the index into the agi hash table for the
>  	 * list this inode will go on.
>  	 */
> -	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> -	if (!xfs_verify_agino(mp, agno, agino))
> -		return -EFSCORRUPTED;
> -	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
>  	if (!xfs_verify_agino(mp, agno,
>  			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
>  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-04 17:59 ` [PATCH 02/10] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
@ 2019-02-04 19:24   ` Brian Foster
  2019-02-04 20:53     ` Christoph Hellwig
  2019-02-05  0:42     ` Darrick J. Wong
  0 siblings, 2 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-04 19:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:14AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Track the number of unlinked inodes in each AG so that we can use these
> decisions to throttle inactivations when the unlinked list gets long.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

I'm a little confused where we're at with this one given the discussion
on the previous version. We've dropped the locking, but left the
tracking in place. Are we just relying on holding the agi in the
iunlink/iunlink_remove cases? If so, that seems reasonable to me but the
commit log should probably have a sentence or two on the serialization
rules. The commit log could also use an update to describe how this
value is actually used in this patch (as an unmount time check) as
opposed to some apparent throttling functionality that isn't a part of
this series.

Brian

>  fs/xfs/xfs_inode.c       |   35 ++++++++++++++++++++++++-----------
>  fs/xfs/xfs_log_recover.c |    6 ++++++
>  fs/xfs/xfs_mount.c       |    2 ++
>  fs/xfs/xfs_mount.h       |    3 +++
>  4 files changed, 35 insertions(+), 11 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 2367cdfcd90b..435901c7c674 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1900,6 +1900,7 @@ xfs_iunlink(
>  	struct xfs_dinode	*dip;
>  	struct xfs_buf		*agibp;
>  	struct xfs_buf		*ibp;
> +	struct xfs_perag	*pag;
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> @@ -1907,11 +1908,12 @@ xfs_iunlink(
>  	int			error;
>  
>  	ASSERT(VFS_I(ip)->i_mode != 0);
> +	pag = xfs_perag_get(mp, agno);
>  
>  	/* Get the agi buffer first.  It ensures lock ordering on the list. */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
>  	if (error)
> -		return error;
> +		goto out;
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
> @@ -1931,7 +1933,7 @@ xfs_iunlink(
>  		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
>  				       0, 0);
>  		if (error)
> -			return error;
> +			goto out;
>  
>  		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
>  		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> @@ -1956,7 +1958,10 @@ xfs_iunlink(
>  		(sizeof(xfs_agino_t) * bucket_index);
>  	xfs_trans_log_buf(tp, agibp, offset,
>  			  (offset + sizeof(xfs_agino_t) - 1));
> -	return 0;
> +	pag->pagi_unlinked_count++;
> +out:
> +	xfs_perag_put(pag);
> +	return error;
>  }
>  
>  /*
> @@ -1974,6 +1979,7 @@ xfs_iunlink_remove(
>  	struct xfs_buf		*ibp;
>  	struct xfs_buf		*last_ibp;
>  	struct xfs_dinode	*last_dip = NULL;
> +	struct xfs_perag	*pag;
>  	xfs_ino_t		next_ino;
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> @@ -1983,10 +1989,12 @@ xfs_iunlink_remove(
>  	int			last_offset = 0;
>  	int			error;
>  
> +	pag = xfs_perag_get(mp, agno);
> +
>  	/* Get the agi buffer first.  It ensures lock ordering on the list. */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
>  	if (error)
> -		return error;
> +		goto out;
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
> @@ -1997,7 +2005,8 @@ xfs_iunlink_remove(
>  			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
>  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
>  				agi, sizeof(*agi));
> -		return -EFSCORRUPTED;
> +		error = -EFSCORRUPTED;
> +		goto out;
>  	}
>  
>  	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
> @@ -2013,7 +2022,7 @@ xfs_iunlink_remove(
>  		if (error) {
>  			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
>  				__func__, error);
> -			return error;
> +			goto out;
>  		}
>  		next_agino = be32_to_cpu(dip->di_next_unlinked);
>  		ASSERT(next_agino != 0);
> @@ -2062,7 +2071,7 @@ xfs_iunlink_remove(
>  				xfs_warn(mp,
>  	"%s: xfs_imap returned error %d.",
>  					 __func__, error);
> -				return error;
> +				goto out;
>  			}
>  
>  			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> @@ -2071,7 +2080,7 @@ xfs_iunlink_remove(
>  				xfs_warn(mp,
>  	"%s: xfs_imap_to_bp returned error %d.",
>  					__func__, error);
> -				return error;
> +				goto out;
>  			}
>  
>  			last_offset = imap.im_boffset;
> @@ -2080,7 +2089,8 @@ xfs_iunlink_remove(
>  				XFS_CORRUPTION_ERROR(__func__,
>  						XFS_ERRLEVEL_LOW, mp,
>  						last_dip, sizeof(*last_dip));
> -				return -EFSCORRUPTED;
> +				error = -EFSCORRUPTED;
> +				goto out;
>  			}
>  		}
>  
> @@ -2093,7 +2103,7 @@ xfs_iunlink_remove(
>  		if (error) {
>  			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
>  				__func__, error);
> -			return error;
> +			goto out;
>  		}
>  		next_agino = be32_to_cpu(dip->di_next_unlinked);
>  		ASSERT(next_agino != 0);
> @@ -2128,7 +2138,10 @@ xfs_iunlink_remove(
>  				  (offset + sizeof(xfs_agino_t) - 1));
>  		xfs_inobp_check(mp, last_ibp);
>  	}
> -	return 0;
> +	pag->pagi_unlinked_count--;
> +out:
> +	xfs_perag_put(pag);
> +	return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index ff9a27834c50..c634fbfea4a8 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -5054,6 +5054,7 @@ xlog_recover_process_one_iunlink(
>  	struct xfs_buf			*ibp;
>  	struct xfs_dinode		*dip;
>  	struct xfs_inode		*ip;
> +	struct xfs_perag		*pag;
>  	xfs_ino_t			ino;
>  	int				error;
>  
> @@ -5077,6 +5078,11 @@ xlog_recover_process_one_iunlink(
>  	agino = be32_to_cpu(dip->di_next_unlinked);
>  	xfs_buf_relse(ibp);
>  
> +	/* Make sure the in-core data knows about this unlinked inode. */
> +	pag = xfs_perag_get(mp, agno);
> +	pag->pagi_unlinked_count++;
> +	xfs_perag_put(pag);
> +
>  	/*
>  	 * Prevent any DMAPI event from being sent when the reference on
>  	 * the inode is dropped.
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 10be706ec72e..6ca92a71c233 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -149,6 +149,8 @@ xfs_free_perag(
>  		spin_unlock(&mp->m_perag_lock);
>  		ASSERT(pag);
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> +		ASSERT(pag->pagi_unlinked_count == 0 ||
> +		       XFS_FORCED_SHUTDOWN(mp));
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index e344b1dfde63..169069a01f3c 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -388,6 +388,9 @@ typedef struct xfs_perag {
>  
>  	/* reference count */
>  	uint8_t			pagf_refcount_level;
> +
> +	/* unlinked inode info; lock AGI to access */
> +	unsigned int		pagi_unlinked_count;
>  } xfs_perag_t;
>  
>  static inline struct xfs_ag_resv *
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper
  2019-02-04 17:59 ` [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper Darrick J. Wong
@ 2019-02-04 19:24   ` Brian Foster
  2019-02-04 20:53   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-04 19:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:21AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add a new helper to check that a per-AG inode pointer is either null or
> points somewhere valid within that AG.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_inode_buf.c |    3 +--
>  fs/xfs/libxfs/xfs_types.c     |   13 +++++++++++++
>  fs/xfs/libxfs/xfs_types.h     |    2 ++
>  fs/xfs/scrub/agheader.c       |    8 +++-----
>  4 files changed, 19 insertions(+), 7 deletions(-)
> 
> 
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index 8fa1050c1ae2..77e412a4b6ea 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -99,8 +99,7 @@ xfs_inode_buf_verify(
>  		unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
>  		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
>  			xfs_dinode_good_version(mp, dip->di_version) &&
> -			(unlinked_ino == NULLAGINO ||
> -			 xfs_verify_agino(mp, agno, unlinked_ino));
> +			xfs_verify_agino_or_null(mp, agno, unlinked_ino);
>  		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
>  						XFS_ERRTAG_ITOBP_INOTOBP))) {
>  			if (readahead) {
> diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
> index 0e35b4bdfef7..5b37c99b2811 100644
> --- a/fs/xfs/libxfs/xfs_types.c
> +++ b/fs/xfs/libxfs/xfs_types.c
> @@ -115,6 +115,19 @@ xfs_verify_agino(
>  	return agino >= first && agino <= last;
>  }
>  
> +/*
> + * Verify that an AG inode number pointer neither points outside the AG
> + * nor points at static metadata, or is NULLAGINO.
> + */
> +bool
> +xfs_verify_agino_or_null(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_agino_t		agino)
> +{
> +	return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
> +}
> +
>  /*
>   * Verify that an FS inode number pointer neither points outside the
>   * filesystem nor points at static AG metadata.
> diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> index 704b4f308780..c5a25403b4db 100644
> --- a/fs/xfs/libxfs/xfs_types.h
> +++ b/fs/xfs/libxfs/xfs_types.h
> @@ -183,6 +183,8 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
>  		xfs_agino_t *first, xfs_agino_t *last);
>  bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
>  		xfs_agino_t agino);
> +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		xfs_agino_t agino);
>  bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
>  bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
>  bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
> diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
> index 90955ab1e895..9d4e8293d37e 100644
> --- a/fs/xfs/scrub/agheader.c
> +++ b/fs/xfs/scrub/agheader.c
> @@ -864,19 +864,17 @@ xchk_agi(
>  
>  	/* Check inode pointers */
>  	agino = be32_to_cpu(agi->agi_newino);
> -	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
> +	if (!xfs_verify_agino_or_null(mp, agno, agino))
>  		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
>  
>  	agino = be32_to_cpu(agi->agi_dirino);
> -	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
> +	if (!xfs_verify_agino_or_null(mp, agno, agino))
>  		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
>  
>  	/* Check unlinked inode buckets */
>  	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
>  		agino = be32_to_cpu(agi->agi_unlinked[i]);
> -		if (agino == NULLAGINO)
> -			continue;
> -		if (!xfs_verify_agino(mp, agno, agino))
> +		if (!xfs_verify_agino_or_null(mp, agno, agino))
>  			xchk_block_set_corrupt(sc, sc->sa.agi_bp);
>  	}
>  
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 04/10] xfs: refactor AGI unlinked bucket updates
  2019-02-04 17:59 ` [PATCH 04/10] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
@ 2019-02-04 19:25   ` Brian Foster
  2019-02-04 20:54   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-04 19:25 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:27AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Split the AGI unlinked bucket updates into a separate function.  No
> functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_inode.c |   66 +++++++++++++++++++++++++++++++++++++---------------
>  fs/xfs/xfs_trace.h |   26 ++++++++++++++++++++
>  2 files changed, 73 insertions(+), 19 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 435901c7c674..8c4a1d198de8 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1880,6 +1880,43 @@ xfs_inactive(
>  	xfs_qm_dqdetach(ip);
>  }
>  
> +/*
> + * Point the AGI unlinked bucket at an inode and log the results.  The caller
> + * is responsible for validating the old value.
> + */
> +STATIC int
> +xfs_iunlink_update_bucket(
> +	struct xfs_trans	*tp,
> +	xfs_agnumber_t		agno,
> +	struct xfs_buf		*agibp,
> +	unsigned int		bucket_index,
> +	xfs_agino_t		new_agino)
> +{
> +	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
> +	xfs_agino_t		old_value;
> +	int			offset;
> +
> +	ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
> +
> +	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> +	trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
> +			old_value, new_agino);
> +
> +	/*
> +	 * We should never find the head of the list already set to the value
> +	 * passed in because either we're adding or removing ourselves from the
> +	 * head of the list.
> +	 */
> +	if (old_value == new_agino)
> +		return -EFSCORRUPTED;
> +
> +	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
> +	offset = offsetof(struct xfs_agi, agi_unlinked) +
> +			(sizeof(xfs_agino_t) * bucket_index);
> +	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
> +	return 0;
> +}
> +
>  /*
>   * This is called when the inode's link count goes to 0 or we are creating a
>   * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
> @@ -1949,15 +1986,10 @@ xfs_iunlink(
>  		xfs_inobp_check(mp, ibp);
>  	}
>  
> -	/*
> -	 * Point the bucket head pointer at the inode being inserted.
> -	 */
> -	ASSERT(agino != 0);
> -	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
> -	offset = offsetof(xfs_agi_t, agi_unlinked) +
> -		(sizeof(xfs_agino_t) * bucket_index);
> -	xfs_trans_log_buf(tp, agibp, offset,
> -			  (offset + sizeof(xfs_agino_t) - 1));
> +	/* Point the head of the list to point to this inode. */
> +	error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
> +	if (error)
> +		goto out;
>  	pag->pagi_unlinked_count++;
>  out:
>  	xfs_perag_put(pag);
> @@ -2041,16 +2073,12 @@ xfs_iunlink_remove(
>  		} else {
>  			xfs_trans_brelse(tp, ibp);
>  		}
> -		/*
> -		 * Point the bucket head pointer at the next inode.
> -		 */
> -		ASSERT(next_agino != 0);
> -		ASSERT(next_agino != agino);
> -		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
> -		offset = offsetof(xfs_agi_t, agi_unlinked) +
> -			(sizeof(xfs_agino_t) * bucket_index);
> -		xfs_trans_log_buf(tp, agibp, offset,
> -				  (offset + sizeof(xfs_agino_t) - 1));
> +
> +		/* Point the head of the list to the next unlinked inode. */
> +		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> +				next_agino);
> +		if (error)
> +			goto out;
>  	} else {
>  		/*
>  		 * We need to search the list for the inode being freed.
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 6fcc893dfc91..c10478e7e49a 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3371,6 +3371,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
>  DEFINE_TRANS_EVENT(xfs_trans_add_item);
>  DEFINE_TRANS_EVENT(xfs_trans_free_items);
>  
> +TRACE_EVENT(xfs_iunlink_update_bucket,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
> +		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
> +	TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(unsigned int, bucket)
> +		__field(xfs_agino_t, old_ptr)
> +		__field(xfs_agino_t, new_ptr)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->bucket = bucket;
> +		__entry->old_ptr = old_ptr;
> +		__entry->new_ptr = new_ptr;
> +	),
> +	TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno,
> +		  __entry->bucket,
> +		  __entry->old_ptr,
> +		  __entry->new_ptr)
> +);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 01/10] xfs: clean up iunlink functions
  2019-02-04 17:59 ` [PATCH 01/10] xfs: clean up iunlink functions Darrick J. Wong
  2019-02-04 19:24   ` Brian Foster
@ 2019-02-04 20:51   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:51 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:03AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Fix some indentation issues with the iunlink functions and reorganize
> the tops of the functions to be identical.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-04 19:24   ` Brian Foster
@ 2019-02-04 20:53     ` Christoph Hellwig
  2019-02-05  0:26       ` Darrick J. Wong
  2019-02-05  0:42     ` Darrick J. Wong
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:53 UTC (permalink / raw)
  To: Brian Foster; +Cc: Darrick J. Wong, linux-xfs

On Mon, Feb 04, 2019 at 02:24:51PM -0500, Brian Foster wrote:
> I'm a little confused where we're at with this one given the discussion
> on the previous version. We've dropped the locking, but left the
> tracking in place. Are we just relying on holding the agi in the
> iunlink/iunlink_remove cases? If so, that seems reasonable to me but the
> commit log should probably have a sentence or two on the serialization
> rules. The commit log could also use an update to describe how this
> value is actually used in this patch (as an unmount time check) as
> opposed to some apparent throttling functionality that isn't a part of
> this series.

I thought we Darrick was going to drop this tracking from the series,
as it isn't very useful (at least yet), but maybe I misunderstood the
previous thread.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper
  2019-02-04 17:59 ` [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper Darrick J. Wong
  2019-02-04 19:24   ` Brian Foster
@ 2019-02-04 20:53   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:53 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:21AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add a new helper to check that a per-AG inode pointer is either null or
> points somewhere valid within that AG.

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 04/10] xfs: refactor AGI unlinked bucket updates
  2019-02-04 17:59 ` [PATCH 04/10] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
  2019-02-04 19:25   ` Brian Foster
@ 2019-02-04 20:54   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:54 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:27AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Split the AGI unlinked bucket updates into a separate function.  No
> functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 06/10] xfs: refactor inode unlinked pointer update functions
  2019-02-04 17:59 ` [PATCH 06/10] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
@ 2019-02-04 20:56   ` Christoph Hellwig
  2019-02-04 21:49     ` Darrick J. Wong
  2019-02-05 14:23   ` Brian Foster
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:56 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

> +	/* Make sure the old pointer isn't garbage. */
> +	old_value = be32_to_cpu(dip->di_next_unlinked);
> +	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {

Couldn't this use xfs_verify_agino_or_null?

Otherwise looks fine:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function
  2019-02-04 17:59 ` [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function Darrick J. Wong
@ 2019-02-04 20:58   ` Christoph Hellwig
  2019-02-04 22:23     ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:58 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

> +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> +		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
> +				&last_ibp);

Last round I suggested pasing the agno/agino to xfs_iunlink_map_ino,
and good reason for not doing that?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 08/10] xfs: refactor inode update in iunlink_remove
  2019-02-04 17:59 ` [PATCH 08/10] xfs: refactor inode update in iunlink_remove Darrick J. Wong
@ 2019-02-04 20:58   ` Christoph Hellwig
  2019-02-05 14:23   ` Brian Foster
  1 sibling, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:58 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:52AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> In xfs_iunlink_remove we have two identical calls to
> xfs_iunlink_update_inode, so move it out of the if statement to simplify
> the code some more.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 09/10] xfs: add tracepoints for high level iunlink operations
  2019-02-04 17:59 ` [PATCH 09/10] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
@ 2019-02-04 20:59   ` Christoph Hellwig
  2019-02-05 14:24   ` Brian Foster
  1 sibling, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 20:59 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:58AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add tracepoints so we can associate high level operations with low level
> updates.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-04 18:00 ` [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
@ 2019-02-04 21:08   ` Christoph Hellwig
  2019-02-05  1:02     ` Darrick J. Wong
  2019-02-05 14:24   ` Brian Foster
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-04 21:08 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

> +int xfs_iunlink_init(struct xfs_perag *pag);
> +void xfs_iunlink_destroy(struct xfs_perag *pag);
> +xfs_agino_t xfs_iunlink_lookup_backref(struct xfs_perag *pag,
> +		xfs_agino_t agino);
> +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);

xfs_iunlink_lookup_backref and xfs_iunlink_change_backref aren't
used outside of xfs_inode.c and should be marked static.

> +	/*
> +	 * Make sure the in-core data knows about this unlinked inode.  Since
> +	 * our iunlinks recovery basically just deletes the head of a bucket
> +	 * list until the bucket is empty, we need only to add the backref from
> +	 * the current list item to the next one, if this isn't the list tail.
> +	 */
>  	pag = xfs_perag_get(mp, agno);
>  	pag->pagi_unlinked_count++;
> +	if (agino != NULLAGINO)
> +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> +				agino);
>  	xfs_perag_put(pag);
> +	if (error)
> +		goto fail_iput;

Note that the previos agino that we recaculate above is actually passed
to the function as an argument.  I think we should just add a new
next_agino variable for the one we read from the dinode and return and
reuse the argument here instead of recaculating it.

Question: what lock now protects the rhastable modifications?  Maybe
we need to add some lockdep asserts to document that.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 06/10] xfs: refactor inode unlinked pointer update functions
  2019-02-04 20:56   ` Christoph Hellwig
@ 2019-02-04 21:49     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 21:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 12:56:28PM -0800, Christoph Hellwig wrote:
> > +	/* Make sure the old pointer isn't garbage. */
> > +	old_value = be32_to_cpu(dip->di_next_unlinked);
> > +	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {
> 
> Couldn't this use xfs_verify_agino_or_null?

Oops, missed that.  Will fix.

--D

> Otherwise looks fine:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function
  2019-02-04 20:58   ` Christoph Hellwig
@ 2019-02-04 22:23     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 22:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 12:58:28PM -0800, Christoph Hellwig wrote:
> > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > +		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
> > +				&last_ibp);
> 
> Last round I suggested pasing the agno/agino to xfs_iunlink_map_ino,
> and good reason for not doing that?

I forgot to do it. :(

Looking through the code, I could also change the _update_dinode
function to take xfs_agino_t instead of xfs_ino_t, though even that is
nearly pointless since we only do it to feed a tracepoint.

Anyway, I'll incorporate that into v3.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-04 20:53     ` Christoph Hellwig
@ 2019-02-05  0:26       ` Darrick J. Wong
  2019-02-05  6:55         ` Christoph Hellwig
  0 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05  0:26 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Brian Foster, linux-xfs

On Mon, Feb 04, 2019 at 12:53:10PM -0800, Christoph Hellwig wrote:
> On Mon, Feb 04, 2019 at 02:24:51PM -0500, Brian Foster wrote:
> > I'm a little confused where we're at with this one given the discussion
> > on the previous version. We've dropped the locking, but left the
> > tracking in place. Are we just relying on holding the agi in the
> > iunlink/iunlink_remove cases? If so, that seems reasonable to me but the
> > commit log should probably have a sentence or two on the serialization
> > rules. The commit log could also use an update to describe how this
> > value is actually used in this patch (as an unmount time check) as
> > opposed to some apparent throttling functionality that isn't a part of
> > this series.
> 
> I thought we Darrick was going to drop this tracking from the series,
> as it isn't very useful (at least yet), but maybe I misunderstood the
> previous thread.

I decided to leave the unlinked counter so that we could have a
debugging check.  I will make it more explicit that anyone accessing the
counter needs to hold the AGI buffer lock or otherwise assured that
there aren't any other threads.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-04 19:24   ` Brian Foster
  2019-02-04 20:53     ` Christoph Hellwig
@ 2019-02-05  0:42     ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05  0:42 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 02:24:51PM -0500, Brian Foster wrote:
> On Mon, Feb 04, 2019 at 09:59:14AM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Track the number of unlinked inodes in each AG so that we can use these
> > decisions to throttle inactivations when the unlinked list gets long.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> I'm a little confused where we're at with this one given the discussion
> on the previous version. We've dropped the locking, but left the
> tracking in place. Are we just relying on holding the agi in the
> iunlink/iunlink_remove cases?

Yes.

> If so, that seems reasonable to me but the
> commit log should probably have a sentence or two on the serialization
> rules. The commit log could also use an update to describe how this
> value is actually used in this patch (as an unmount time check) as
> opposed to some apparent throttling functionality that isn't a part of
> this series.

I will improve the comments in the structure definition and update the
commit message.

--D

> Brian
> 
> >  fs/xfs/xfs_inode.c       |   35 ++++++++++++++++++++++++-----------
> >  fs/xfs/xfs_log_recover.c |    6 ++++++
> >  fs/xfs/xfs_mount.c       |    2 ++
> >  fs/xfs/xfs_mount.h       |    3 +++
> >  4 files changed, 35 insertions(+), 11 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 2367cdfcd90b..435901c7c674 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1900,6 +1900,7 @@ xfs_iunlink(
> >  	struct xfs_dinode	*dip;
> >  	struct xfs_buf		*agibp;
> >  	struct xfs_buf		*ibp;
> > +	struct xfs_perag	*pag;
> >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> >  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> >  	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> > @@ -1907,11 +1908,12 @@ xfs_iunlink(
> >  	int			error;
> >  
> >  	ASSERT(VFS_I(ip)->i_mode != 0);
> > +	pag = xfs_perag_get(mp, agno);
> >  
> >  	/* Get the agi buffer first.  It ensures lock ordering on the list. */
> >  	error = xfs_read_agi(mp, tp, agno, &agibp);
> >  	if (error)
> > -		return error;
> > +		goto out;
> >  	agi = XFS_BUF_TO_AGI(agibp);
> >  
> >  	/*
> > @@ -1931,7 +1933,7 @@ xfs_iunlink(
> >  		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> >  				       0, 0);
> >  		if (error)
> > -			return error;
> > +			goto out;
> >  
> >  		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
> >  		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> > @@ -1956,7 +1958,10 @@ xfs_iunlink(
> >  		(sizeof(xfs_agino_t) * bucket_index);
> >  	xfs_trans_log_buf(tp, agibp, offset,
> >  			  (offset + sizeof(xfs_agino_t) - 1));
> > -	return 0;
> > +	pag->pagi_unlinked_count++;
> > +out:
> > +	xfs_perag_put(pag);
> > +	return error;
> >  }
> >  
> >  /*
> > @@ -1974,6 +1979,7 @@ xfs_iunlink_remove(
> >  	struct xfs_buf		*ibp;
> >  	struct xfs_buf		*last_ibp;
> >  	struct xfs_dinode	*last_dip = NULL;
> > +	struct xfs_perag	*pag;
> >  	xfs_ino_t		next_ino;
> >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> >  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> > @@ -1983,10 +1989,12 @@ xfs_iunlink_remove(
> >  	int			last_offset = 0;
> >  	int			error;
> >  
> > +	pag = xfs_perag_get(mp, agno);
> > +
> >  	/* Get the agi buffer first.  It ensures lock ordering on the list. */
> >  	error = xfs_read_agi(mp, tp, agno, &agibp);
> >  	if (error)
> > -		return error;
> > +		goto out;
> >  	agi = XFS_BUF_TO_AGI(agibp);
> >  
> >  	/*
> > @@ -1997,7 +2005,8 @@ xfs_iunlink_remove(
> >  			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
> >  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
> >  				agi, sizeof(*agi));
> > -		return -EFSCORRUPTED;
> > +		error = -EFSCORRUPTED;
> > +		goto out;
> >  	}
> >  
> >  	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
> > @@ -2013,7 +2022,7 @@ xfs_iunlink_remove(
> >  		if (error) {
> >  			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> >  				__func__, error);
> > -			return error;
> > +			goto out;
> >  		}
> >  		next_agino = be32_to_cpu(dip->di_next_unlinked);
> >  		ASSERT(next_agino != 0);
> > @@ -2062,7 +2071,7 @@ xfs_iunlink_remove(
> >  				xfs_warn(mp,
> >  	"%s: xfs_imap returned error %d.",
> >  					 __func__, error);
> > -				return error;
> > +				goto out;
> >  			}
> >  
> >  			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> > @@ -2071,7 +2080,7 @@ xfs_iunlink_remove(
> >  				xfs_warn(mp,
> >  	"%s: xfs_imap_to_bp returned error %d.",
> >  					__func__, error);
> > -				return error;
> > +				goto out;
> >  			}
> >  
> >  			last_offset = imap.im_boffset;
> > @@ -2080,7 +2089,8 @@ xfs_iunlink_remove(
> >  				XFS_CORRUPTION_ERROR(__func__,
> >  						XFS_ERRLEVEL_LOW, mp,
> >  						last_dip, sizeof(*last_dip));
> > -				return -EFSCORRUPTED;
> > +				error = -EFSCORRUPTED;
> > +				goto out;
> >  			}
> >  		}
> >  
> > @@ -2093,7 +2103,7 @@ xfs_iunlink_remove(
> >  		if (error) {
> >  			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
> >  				__func__, error);
> > -			return error;
> > +			goto out;
> >  		}
> >  		next_agino = be32_to_cpu(dip->di_next_unlinked);
> >  		ASSERT(next_agino != 0);
> > @@ -2128,7 +2138,10 @@ xfs_iunlink_remove(
> >  				  (offset + sizeof(xfs_agino_t) - 1));
> >  		xfs_inobp_check(mp, last_ibp);
> >  	}
> > -	return 0;
> > +	pag->pagi_unlinked_count--;
> > +out:
> > +	xfs_perag_put(pag);
> > +	return error;
> >  }
> >  
> >  /*
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index ff9a27834c50..c634fbfea4a8 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -5054,6 +5054,7 @@ xlog_recover_process_one_iunlink(
> >  	struct xfs_buf			*ibp;
> >  	struct xfs_dinode		*dip;
> >  	struct xfs_inode		*ip;
> > +	struct xfs_perag		*pag;
> >  	xfs_ino_t			ino;
> >  	int				error;
> >  
> > @@ -5077,6 +5078,11 @@ xlog_recover_process_one_iunlink(
> >  	agino = be32_to_cpu(dip->di_next_unlinked);
> >  	xfs_buf_relse(ibp);
> >  
> > +	/* Make sure the in-core data knows about this unlinked inode. */
> > +	pag = xfs_perag_get(mp, agno);
> > +	pag->pagi_unlinked_count++;
> > +	xfs_perag_put(pag);
> > +
> >  	/*
> >  	 * Prevent any DMAPI event from being sent when the reference on
> >  	 * the inode is dropped.
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 10be706ec72e..6ca92a71c233 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -149,6 +149,8 @@ xfs_free_perag(
> >  		spin_unlock(&mp->m_perag_lock);
> >  		ASSERT(pag);
> >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> > +		ASSERT(pag->pagi_unlinked_count == 0 ||
> > +		       XFS_FORCED_SHUTDOWN(mp));
> >  		xfs_buf_hash_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index e344b1dfde63..169069a01f3c 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -388,6 +388,9 @@ typedef struct xfs_perag {
> >  
> >  	/* reference count */
> >  	uint8_t			pagf_refcount_level;
> > +
> > +	/* unlinked inode info; lock AGI to access */
> > +	unsigned int		pagi_unlinked_count;
> >  } xfs_perag_t;
> >  
> >  static inline struct xfs_ag_resv *
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-04 21:08   ` Christoph Hellwig
@ 2019-02-05  1:02     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05  1:02 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 01:08:48PM -0800, Christoph Hellwig wrote:
> > +int xfs_iunlink_init(struct xfs_perag *pag);
> > +void xfs_iunlink_destroy(struct xfs_perag *pag);
> > +xfs_agino_t xfs_iunlink_lookup_backref(struct xfs_perag *pag,
> > +		xfs_agino_t agino);
> > +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> > +		xfs_agino_t this_agino);
> > +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> > +		xfs_agino_t this_agino);
> 
> xfs_iunlink_lookup_backref and xfs_iunlink_change_backref aren't
> used outside of xfs_inode.c and should be marked static.

Fixed.

> > +	/*
> > +	 * Make sure the in-core data knows about this unlinked inode.  Since
> > +	 * our iunlinks recovery basically just deletes the head of a bucket
> > +	 * list until the bucket is empty, we need only to add the backref from
> > +	 * the current list item to the next one, if this isn't the list tail.
> > +	 */
> >  	pag = xfs_perag_get(mp, agno);
> >  	pag->pagi_unlinked_count++;
> > +	if (agino != NULLAGINO)
> > +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> > +				agino);
> >  	xfs_perag_put(pag);
> > +	if (error)
> > +		goto fail_iput;
> 
> Note that the previos agino that we recaculate above is actually passed
> to the function as an argument.  I think we should just add a new
> next_agino variable for the one we read from the dinode and return and
> reuse the argument here instead of recaculating it.

Ok, I'll change the variable names.

> Question: what lock now protects the rhastable modifications?  Maybe
> we need to add some lockdep asserts to document that.

Callers are expected to hold the AGI buffer lock to serialize accesses
to the incore unlinked data.  That includes pagi_unlinked_count and
the new rhashtable.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-05  0:26       ` Darrick J. Wong
@ 2019-02-05  6:55         ` Christoph Hellwig
  2019-02-05  7:07           ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-05  6:55 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Brian Foster, linux-xfs

On Mon, Feb 04, 2019 at 04:26:38PM -0800, Darrick J. Wong wrote:
> I decided to leave the unlinked counter so that we could have a
> debugging check.  I will make it more explicit that anyone accessing the
> counter needs to hold the AGI buffer lock or otherwise assured that
> there aren't any other threads.

Don'we already have a debug check by the fact that the rhashtable
destroy function throws an assert if it finds anything?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/10] xfs: track unlinked inode counts in per-ag data
  2019-02-05  6:55         ` Christoph Hellwig
@ 2019-02-05  7:07           ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05  7:07 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Brian Foster, linux-xfs

On Mon, Feb 04, 2019 at 10:55:18PM -0800, Christoph Hellwig wrote:
> On Mon, Feb 04, 2019 at 04:26:38PM -0800, Darrick J. Wong wrote:
> > I decided to leave the unlinked counter so that we could have a
> > debugging check.  I will make it more explicit that anyone accessing the
> > counter needs to hold the AGI buffer lock or otherwise assured that
> > there aren't any other threads.
> 
> Don'we already have a debug check by the fact that the rhashtable
> destroy function throws an assert if it finds anything?

Yes.  If we're maintaining the backrefs correctly then we should never
have any left over and presumably we shouldn't ever need to fall back to
the bucket walk.

Ok, dropped.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 06/10] xfs: refactor inode unlinked pointer update functions
  2019-02-04 17:59 ` [PATCH 06/10] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
  2019-02-04 20:56   ` Christoph Hellwig
@ 2019-02-05 14:23   ` Brian Foster
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-05 14:23 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:40AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Hoist the functions that update an inode's unlinked pointer updates into
> a helper.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Looks fine modulo Christoph's comment:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_inode.c |  191 +++++++++++++++++++++++++++-------------------------
>  fs/xfs/xfs_trace.h |   26 +++++++
>  2 files changed, 124 insertions(+), 93 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 9c17ae95b18f..f23f13f0f2e6 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1917,6 +1917,85 @@ xfs_iunlink_update_bucket(
>  	return 0;
>  }
>  
> +/* Set an on-disk inode's next_unlinked pointer. */
> +STATIC void
> +xfs_iunlink_update_dinode(
> +	struct xfs_trans	*tp,
> +	xfs_agnumber_t		agno,
> +	struct xfs_buf		*ibp,
> +	struct xfs_dinode	*dip,
> +	struct xfs_imap		*imap,
> +	xfs_ino_t		ino,
> +	xfs_agino_t		next_agino)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	int			offset;
> +
> +	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
> +
> +	trace_xfs_iunlink_update_dinode(mp, agno, XFS_INO_TO_AGINO(mp, ino),
> +			be32_to_cpu(dip->di_next_unlinked), next_agino);
> +
> +	dip->di_next_unlinked = cpu_to_be32(next_agino);
> +	offset = imap->im_boffset +
> +			offsetof(struct xfs_dinode, di_next_unlinked);
> +
> +	/* need to recalc the inode CRC if appropriate */
> +	xfs_dinode_calc_crc(mp, dip);
> +	xfs_trans_inode_buf(tp, ibp);
> +	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
> +	xfs_inobp_check(mp, ibp);
> +}
> +
> +/* Set an in-core inode's unlinked pointer and return the old value. */
> +STATIC int
> +xfs_iunlink_update_inode(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip,
> +	xfs_agnumber_t		agno,
> +	xfs_agino_t		next_agino,
> +	xfs_agino_t		*old_next_agino)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_dinode	*dip;
> +	struct xfs_buf		*ibp;
> +	xfs_agino_t		old_value;
> +	int			error;
> +
> +	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
> +
> +	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
> +	if (error)
> +		return error;
> +
> +	/* Make sure the old pointer isn't garbage. */
> +	old_value = be32_to_cpu(dip->di_next_unlinked);
> +	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {
> +		error = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Since we're updating a linked list, we should never find that the
> +	 * current pointer is the same as the new value, unless we're
> +	 * terminating the list.
> +	 */
> +	*old_next_agino = old_value;
> +	if (old_value == next_agino) {
> +		if (next_agino != NULLAGINO)
> +			error = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	/* Ok, update the new pointer. */
> +	xfs_iunlink_update_dinode(tp, agno, ibp, dip, &ip->i_imap, ip->i_ino,
> +			next_agino);
> +	return 0;
> +out:
> +	xfs_trans_brelse(tp, ibp);
> +	return error;
> +}
> +
>  /*
>   * This is called when the inode's link count goes to 0 or we are creating a
>   * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
> @@ -1934,15 +2013,12 @@ xfs_iunlink(
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
>  	struct xfs_agi		*agi;
> -	struct xfs_dinode	*dip;
>  	struct xfs_buf		*agibp;
> -	struct xfs_buf		*ibp;
>  	struct xfs_perag	*pag;
>  	xfs_agino_t		next_agino;
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> -	int			offset;
>  	int			error;
>  
>  	ASSERT(VFS_I(ip)->i_mode != 0);
> @@ -1963,33 +2039,21 @@ xfs_iunlink(
>  	if (next_agino == agino ||
>  	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
>  		error = -EFSCORRUPTED;
> -		return error;
> +		goto out;
>  	}
>  
>  	if (next_agino != NULLAGINO) {
> +		xfs_agino_t	old_agino;
> +
>  		/*
> -		 * There is already another inode in the bucket we need
> -		 * to add ourselves to.  Add us at the front of the list.
> -		 * Here we put the head pointer into our next pointer,
> -		 * and then we fall through to point the head at us.
> +		 * There is already another inode in the bucket, so point this
> +		 * inode to the current head of the list.
>  		 */
> -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> -				       0, 0);
> +		error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
> +				&old_agino);
>  		if (error)
>  			goto out;
> -
> -		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
> -		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> -		offset = ip->i_imap.im_boffset +
> -			offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -		/* need to recalc the inode CRC if appropriate */
> -		xfs_dinode_calc_crc(mp, dip);
> -
> -		xfs_trans_inode_buf(tp, ibp);
> -		xfs_trans_log_buf(tp, ibp, offset,
> -				  (offset + sizeof(xfs_agino_t) - 1));
> -		xfs_inobp_check(mp, ibp);
> +		ASSERT(old_agino == NULLAGINO);
>  	}
>  
>  	/* Point the head of the list to point to this inode. */
> @@ -2012,9 +2076,7 @@ xfs_iunlink_remove(
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
>  	struct xfs_agi		*agi;
> -	struct xfs_dinode	*dip;
>  	struct xfs_buf		*agibp;
> -	struct xfs_buf		*ibp;
>  	struct xfs_buf		*last_ibp;
>  	struct xfs_dinode	*last_dip = NULL;
>  	struct xfs_perag	*pag;
> @@ -2023,8 +2085,6 @@ xfs_iunlink_remove(
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	xfs_agino_t		next_agino;
>  	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> -	int			offset;
> -	int			last_offset = 0;
>  	int			error;
>  
>  	pag = xfs_perag_get(mp, agno);
> @@ -2051,34 +2111,11 @@ xfs_iunlink_remove(
>  		/*
>  		 * We're at the head of the list.  Get the inode's on-disk
>  		 * buffer to see if there is anyone after us on the list.
> -		 * Only modify our next pointer if it is not already NULLAGINO.
> -		 * This saves us the overhead of dealing with the buffer when
> -		 * there is no need to change it.
>  		 */
> -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> -				       0, 0);
> -		if (error) {
> -			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> -				__func__, error);
> +		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
> +				&next_agino);
> +		if (error)
>  			goto out;
> -		}
> -		next_agino = be32_to_cpu(dip->di_next_unlinked);
> -		ASSERT(next_agino != 0);
> -		if (next_agino != NULLAGINO) {
> -			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
> -			offset = ip->i_imap.im_boffset +
> -				offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -			/* need to recalc the inode CRC if appropriate */
> -			xfs_dinode_calc_crc(mp, dip);
> -
> -			xfs_trans_inode_buf(tp, ibp);
> -			xfs_trans_log_buf(tp, ibp, offset,
> -					  (offset + sizeof(xfs_agino_t) - 1));
> -			xfs_inobp_check(mp, ibp);
> -		} else {
> -			xfs_trans_brelse(tp, ibp);
> -		}
>  
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> @@ -2086,13 +2123,13 @@ xfs_iunlink_remove(
>  		if (error)
>  			goto out;
>  	} else {
> +		struct xfs_imap	imap;
> +
>  		/*
>  		 * We need to search the list for the inode being freed.
>  		 */
>  		last_ibp = NULL;
>  		while (next_agino != agino) {
> -			struct xfs_imap	imap;
> -
>  			if (last_ibp)
>  				xfs_trans_brelse(tp, last_ibp);
>  
> @@ -2116,7 +2153,6 @@ xfs_iunlink_remove(
>  				goto out;
>  			}
>  
> -			last_offset = imap.im_boffset;
>  			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
>  			if (!xfs_verify_agino(mp, agno, next_agino)) {
>  				XFS_CORRUPTION_ERROR(__func__,
> @@ -2131,45 +2167,14 @@ xfs_iunlink_remove(
>  		 * Now last_ibp points to the buffer previous to us on the
>  		 * unlinked list.  Pull us from the list.
>  		 */
> -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> -				       0, 0);
> -		if (error) {
> -			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
> -				__func__, error);
> +		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
> +				&next_agino);
> +		if (error)
>  			goto out;
> -		}
> -		next_agino = be32_to_cpu(dip->di_next_unlinked);
> -		ASSERT(next_agino != 0);
> -		ASSERT(next_agino != agino);
> -		if (next_agino != NULLAGINO) {
> -			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
> -			offset = ip->i_imap.im_boffset +
> -				offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -			/* need to recalc the inode CRC if appropriate */
> -			xfs_dinode_calc_crc(mp, dip);
> -
> -			xfs_trans_inode_buf(tp, ibp);
> -			xfs_trans_log_buf(tp, ibp, offset,
> -					  (offset + sizeof(xfs_agino_t) - 1));
> -			xfs_inobp_check(mp, ibp);
> -		} else {
> -			xfs_trans_brelse(tp, ibp);
> -		}
> -		/*
> -		 * Point the previous inode on the list to the next inode.
> -		 */
> -		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
> -		ASSERT(next_agino != 0);
> -		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -		/* need to recalc the inode CRC if appropriate */
> -		xfs_dinode_calc_crc(mp, last_dip);
>  
> -		xfs_trans_inode_buf(tp, last_ibp);
> -		xfs_trans_log_buf(tp, last_ibp, offset,
> -				  (offset + sizeof(xfs_agino_t) - 1));
> -		xfs_inobp_check(mp, last_ibp);
> +		/* Point the previous inode on the list to the next inode. */
> +		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
> +				next_ino, next_agino);
>  	}
>  	pag->pagi_unlinked_count--;
>  out:
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index c10478e7e49a..fbec8f0e1a9a 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3397,6 +3397,32 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
>  		  __entry->new_ptr)
>  );
>  
> +TRACE_EVENT(xfs_iunlink_update_dinode,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
> +		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
> +	TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agino_t, agino)
> +		__field(xfs_agino_t, old_ptr)
> +		__field(xfs_agino_t, new_ptr)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->agino = agino;
> +		__entry->old_ptr = old_ptr;
> +		__entry->new_ptr = new_ptr;
> +	),
> +	TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno,
> +		  __entry->agino,
> +		  __entry->old_ptr,
> +		  __entry->new_ptr)
> +);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 08/10] xfs: refactor inode update in iunlink_remove
  2019-02-04 17:59 ` [PATCH 08/10] xfs: refactor inode update in iunlink_remove Darrick J. Wong
  2019-02-04 20:58   ` Christoph Hellwig
@ 2019-02-05 14:23   ` Brian Foster
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-05 14:23 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:52AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> In xfs_iunlink_remove we have two identical calls to
> xfs_iunlink_update_inode, so move it out of the if statement to simplify
> the code some more.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_inode.c |   34 +++++++++++++---------------------
>  1 file changed, 13 insertions(+), 21 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 8af5f4e989ac..fac9562ecd39 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2183,6 +2183,7 @@ xfs_iunlink_remove(
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	xfs_agino_t		next_agino;
> +	xfs_agino_t		head_agino;
>  	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
>  	int			error;
>  
> @@ -2198,24 +2199,24 @@ xfs_iunlink_remove(
>  	 * Get the index into the agi hash table for the list this inode will
>  	 * go on.  Make sure the head pointer isn't garbage.
>  	 */
> -	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> -	if (!xfs_verify_agino(mp, agno, next_agino)) {
> +	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> +	if (!xfs_verify_agino(mp, agno, head_agino)) {
>  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
>  				agi, sizeof(*agi));
>  		error = -EFSCORRUPTED;
>  		goto out;
>  	}
>  
> -	if (next_agino == agino) {
> -		/*
> -		 * We're at the head of the list.  Get the inode's on-disk
> -		 * buffer to see if there is anyone after us on the list.
> -		 */
> -		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
> -				&next_agino);
> -		if (error)
> -			goto out;
> +	/*
> +	 * Set our inode's next_unlinked pointer to NULL and then return
> +	 * the old pointer value so that we can update whatever was previous
> +	 * to us in the list to point to whatever was next in the list.
> +	 */
> +	error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
> +	if (error)
> +		goto out;
>  
> +	if (head_agino == agino) {
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
>  				next_agino);
> @@ -2225,20 +2226,11 @@ xfs_iunlink_remove(
>  		struct xfs_imap	imap;
>  
>  		/* We need to search the list for the inode being freed. */
> -		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
> +		error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
>  				&prev_ino, &imap, &last_dip, &last_ibp);
>  		if (error)
>  			goto out;
>  
> -		/*
> -		 * Now last_ibp points to the buffer previous to us on the
> -		 * unlinked list.  Pull us from the list.
> -		 */
> -		error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO,
> -				&next_agino);
> -		if (error)
> -			goto out;
> -
>  		/* Point the previous inode on the list to the next inode. */
>  		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
>  				prev_ino, next_agino);
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 09/10] xfs: add tracepoints for high level iunlink operations
  2019-02-04 17:59 ` [PATCH 09/10] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
  2019-02-04 20:59   ` Christoph Hellwig
@ 2019-02-05 14:24   ` Brian Foster
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-05 14:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 09:59:58AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add tracepoints so we can associate high level operations with low level
> updates.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_inode.c |    2 ++
>  fs/xfs/xfs_trace.h |   25 +++++++++++++++++++++++++
>  2 files changed, 27 insertions(+)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index fac9562ecd39..b9696d762c8f 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2023,6 +2023,7 @@ xfs_iunlink(
>  
>  	ASSERT(VFS_I(ip)->i_mode != 0);
>  	pag = xfs_perag_get(mp, agno);
> +	trace_xfs_iunlink(ip);
>  
>  	/* Get the agi buffer first.  It ensures lock ordering on the list. */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
> @@ -2188,6 +2189,7 @@ xfs_iunlink_remove(
>  	int			error;
>  
>  	pag = xfs_perag_get(mp, agno);
> +	trace_xfs_iunlink_remove(ip);
>  
>  	/* Get the agi buffer first.  It ensures lock ordering on the list. */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index fbec8f0e1a9a..a6e384a642b1 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3423,6 +3423,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
>  		  __entry->new_ptr)
>  );
>  
> +DECLARE_EVENT_CLASS(xfs_ag_inode_class,
> +	TP_PROTO(struct xfs_inode *ip),
> +	TP_ARGS(ip),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agino_t, agino)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> +		__entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
> +		__entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
> +	),
> +	TP_printk("dev %d:%d agno %u agino %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno, __entry->agino)
> +)
> +
> +#define DEFINE_AGINODE_EVENT(name) \
> +DEFINE_EVENT(xfs_ag_inode_class, name, \
> +	TP_PROTO(struct xfs_inode *ip), \
> +	TP_ARGS(ip))
> +DEFINE_AGINODE_EVENT(xfs_iunlink);
> +DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-04 18:00 ` [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
  2019-02-04 21:08   ` Christoph Hellwig
@ 2019-02-05 14:24   ` Brian Foster
  2019-02-05 17:53     ` Darrick J. Wong
  1 sibling, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-05 14:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 10:00:05AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Use a rhashtable to cache the unlinked list incore.  This should speed
> up unlinked processing considerably when there are a lot of inodes on
> the unlinked list because iunlink_remove no longer has to traverse an
> entire bucket list to find which inode points to the one being removed.
> 
> The incore list structure records "X.next_unlinked = Y" relations, with
> the rhashtable using Y to index the records.  This makes finding the
> inode X that points to a inode Y very quick.  If our cache fails to find
> anything we can always fall back on the old method.
> 
> FWIW this drastically reduces the amount of time it takes to remove
> inodes from the unlinked list.  I wrote a program to open a lot of
> O_TMPFILE files and then close them in the same order, which takes
> a very long time if we have to traverse the unlinked lists.  With the
> ptach, I see:
> 
...
> ---
>  fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_inode.h       |    9 ++
>  fs/xfs/xfs_log_recover.c |   12 ++-
>  fs/xfs/xfs_mount.c       |    5 +
>  fs/xfs/xfs_mount.h       |    1 
>  5 files changed, 233 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index b9696d762c8f..baee8c894447 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1880,6 +1880,167 @@ xfs_inactive(
>  	xfs_qm_dqdetach(ip);
>  }
>  
...
> +
> +static const struct rhashtable_params xfs_iunlink_hash_params = {
> +	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> +	.nelem_hint		= 512,

Any reasoning behind the 512 value? It seems rather large to me, at
least until we get more into deferred inactivation and whatnot. It looks
like the rhashtable code will round this up to 1024 as well, FWIW.

I'm also wondering whether a kmem_zone might be worthwhile for
xfs_iunlink structures, but that's probably also more for when we expect
to drive deeper unlinked lists.

> +	.key_len		= sizeof(xfs_agino_t),
> +	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
> +	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
> +	.automatic_shrinking	= true,
> +	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
> +};
> +
...
>  /*
>   * Point the AGI unlinked bucket at an inode and log the results.  The caller
>   * is responsible for validating the old value.
> @@ -2055,6 +2216,14 @@ xfs_iunlink(
>  		if (error)
>  			goto out;
>  		ASSERT(old_agino == NULLAGINO);
> +
> +		/*
> +		 * agino has been unlinked, add a backref from the next inode
> +		 * back to agino.
> +		 */
> +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> +		if (error)
> +			goto out;

At a glance, it looks like -ENOMEM/-EEXIST is possible from
rhashtable_insert_fast(). Do we really want to translate those into a
broader operational failure, or perhaps just skip the hashtable update?
The latter seems more appropriate given that we already account for the
possibility of a missing hashtable entry on lookup.

>  	}
>  
>  	/* Point the head of the list to point to this inode. */
> @@ -2127,6 +2296,17 @@ xfs_iunlink_map_prev(
>  
>  	ASSERT(head_agino != target_agino);
>  
> +	/* See if our backref cache can find it faster. */
> +	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
> +	if (next_agino != NULLAGINO) {
> +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> +		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
> +				&last_ibp);
> +		if (error)
> +			return error;

Could we assert or somehow or another verify that
last_dip->di_next_unlinked points at target_agino before we proceed?

> +		goto out;
> +	}
> +
>  	next_agino = head_agino;
>  	while (next_agino != target_agino) {
>  		xfs_agino_t	unlinked_agino;
> @@ -2156,6 +2336,7 @@ xfs_iunlink_map_prev(
>  		next_agino = unlinked_agino;
>  	}
>  
> +out:
>  	/* Should never happen, but don't return garbage. */
>  	if (next_ino == NULLFSINO)
>  		return -EFSCORRUPTED;
> @@ -2218,6 +2399,20 @@ xfs_iunlink_remove(
>  	if (error)
>  		goto out;
>  
> +	/*
> +	 * If there was a backref pointing from the next inode back to this
> +	 * one, remove it because we've removed this inode from the list.
> +	 *
> +	 * Later, if this inode was in the middle of the list we'll update
> +	 * this inode's backref to point from the next inode.
> +	 */
> +	if (next_agino != NULLAGINO) {
> +		error = xfs_iunlink_change_backref(pag, next_agino,
> +				NULLAGINO);
> +		if (error)
> +			goto out;
> +	}
> +
>  	if (head_agino == agino) {
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> @@ -2236,6 +2431,18 @@ xfs_iunlink_remove(
>  		/* Point the previous inode on the list to the next inode. */
>  		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
>  				prev_ino, next_agino);
> +
> +		/*
> +		 * Now we deal with the backref for this inode.  If this inode
> +		 * pointed at a real inode, change the backref that pointed to
> +		 * us to point to our old next.  If this inode was the end of
> +		 * the list, delete the backref that pointed to us.  Note that
> +		 * change_backref takes care of deleting the backref if
> +		 * next_agino is NULLAGINO.
> +		 */

Thanks for the comment.

> +		error = xfs_iunlink_change_backref(pag, agino, next_agino);
> +		if (error)
> +			goto out;

Ok, but the whole lookup path accounts for the possibility of a missing
entry and falls back to the old on-disk walk. It doesn't look like we
handle that properly here. IOW, if the hashtable lookup ever fails and
we do fallback as such, we're guaranteed to eventually fail here.

It seems to me we need to be extra careful so as to not turn in-core
hashtable issues (whether it be external things such as -ENOENT or
internal -ENOMEM or whatever) into fatal filesystem errors.

>  	}
>  	pag->pagi_unlinked_count--;
>  out:
...
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index c634fbfea4a8..f5fb8885662f 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -5078,10 +5078,20 @@ xlog_recover_process_one_iunlink(
>  	agino = be32_to_cpu(dip->di_next_unlinked);
>  	xfs_buf_relse(ibp);
>  
> -	/* Make sure the in-core data knows about this unlinked inode. */
> +	/*
> +	 * Make sure the in-core data knows about this unlinked inode.  Since
> +	 * our iunlinks recovery basically just deletes the head of a bucket
> +	 * list until the bucket is empty, we need only to add the backref from
> +	 * the current list item to the next one, if this isn't the list tail.
> +	 */
>  	pag = xfs_perag_get(mp, agno);
>  	pag->pagi_unlinked_count++;
> +	if (agino != NULLAGINO)
> +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> +				agino);

ISTM that fixing the iunlink_remove code to not rely on hashtable
entries could also alleviate the need for this hack?

>  	xfs_perag_put(pag);
> +	if (error)
> +		goto fail_iput;

Similar potential for error amplification here.

Brian

>  
>  	/*
>  	 * Prevent any DMAPI event from being sent when the reference on
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 6ca92a71c233..9a181f7ca1d5 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -151,6 +151,7 @@ xfs_free_perag(
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
>  		ASSERT(pag->pagi_unlinked_count == 0 ||
>  		       XFS_FORCED_SHUTDOWN(mp));
> +		xfs_iunlink_destroy(pag);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> @@ -229,6 +230,9 @@ xfs_initialize_perag(
>  		/* first new pag is fully initialized */
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
> +		error = xfs_iunlink_init(pag);
> +		if (error)
> +			goto out_hash_destroy;
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> @@ -251,6 +255,7 @@ xfs_initialize_perag(
>  		if (!pag)
>  			break;
>  		xfs_buf_hash_destroy(pag);
> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		kmem_free(pag);
>  	}
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 169069a01f3c..dcc45b441dd6 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -391,6 +391,7 @@ typedef struct xfs_perag {
>  
>  	/* unlinked inode info; lock AGI to access */
>  	unsigned int		pagi_unlinked_count;
> +	struct rhashtable	pagi_unlinked_hash;
>  } xfs_perag_t;
>  
>  static inline struct xfs_ag_resv *
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 14:24   ` Brian Foster
@ 2019-02-05 17:53     ` Darrick J. Wong
  2019-02-05 17:57       ` Christoph Hellwig
                         ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05 17:53 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Tue, Feb 05, 2019 at 09:24:59AM -0500, Brian Foster wrote:
> On Mon, Feb 04, 2019 at 10:00:05AM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Use a rhashtable to cache the unlinked list incore.  This should speed
> > up unlinked processing considerably when there are a lot of inodes on
> > the unlinked list because iunlink_remove no longer has to traverse an
> > entire bucket list to find which inode points to the one being removed.
> > 
> > The incore list structure records "X.next_unlinked = Y" relations, with
> > the rhashtable using Y to index the records.  This makes finding the
> > inode X that points to a inode Y very quick.  If our cache fails to find
> > anything we can always fall back on the old method.
> > 
> > FWIW this drastically reduces the amount of time it takes to remove
> > inodes from the unlinked list.  I wrote a program to open a lot of
> > O_TMPFILE files and then close them in the same order, which takes
> > a very long time if we have to traverse the unlinked lists.  With the
> > ptach, I see:
> > 
> ...
> > ---
> >  fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_inode.h       |    9 ++
> >  fs/xfs/xfs_log_recover.c |   12 ++-
> >  fs/xfs/xfs_mount.c       |    5 +
> >  fs/xfs/xfs_mount.h       |    1 
> >  5 files changed, 233 insertions(+), 1 deletion(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index b9696d762c8f..baee8c894447 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1880,6 +1880,167 @@ xfs_inactive(
> >  	xfs_qm_dqdetach(ip);
> >  }
> >  
> ...
> > +
> > +static const struct rhashtable_params xfs_iunlink_hash_params = {
> > +	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> > +	.nelem_hint		= 512,
> 
> Any reasoning behind the 512 value? It seems rather large to me, at
> least until we get more into deferred inactivation and whatnot. It looks
> like the rhashtable code will round this up to 1024 as well, FWIW.
> 
> I'm also wondering whether a kmem_zone might be worthwhile for
> xfs_iunlink structures, but that's probably also more for when we expect
> to drive deeper unlinked lists.

I picked an arbitrary value of 64 buckets * 8 items per list.  I /do/
have plans to test various values to see if there's a particular sweet
spot, though I guess this could be much lower on the assumption that
we don't expect /that/ many unlinked inodes(?)

> > +	.key_len		= sizeof(xfs_agino_t),
> > +	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
> > +	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
> > +	.automatic_shrinking	= true,
> > +	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
> > +};
> > +
> ...
> >  /*
> >   * Point the AGI unlinked bucket at an inode and log the results.  The caller
> >   * is responsible for validating the old value.
> > @@ -2055,6 +2216,14 @@ xfs_iunlink(
> >  		if (error)
> >  			goto out;
> >  		ASSERT(old_agino == NULLAGINO);
> > +
> > +		/*
> > +		 * agino has been unlinked, add a backref from the next inode
> > +		 * back to agino.
> > +		 */
> > +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> > +		if (error)
> > +			goto out;
> 
> At a glance, it looks like -ENOMEM/-EEXIST is possible from
> rhashtable_insert_fast(). Do we really want to translate those into a
> broader operational failure, or perhaps just skip the hashtable update?
> The latter seems more appropriate given that we already account for the
> possibility of a missing hashtable entry on lookup.

Good point, we could be much more resilient to backref cache failures
since we do have the option of doing it the slow way.

(And, I guess by extension, a debugging knob or something to disable the
cache so that we can test the bucket list walker...)

> >  	}
> >  
> >  	/* Point the head of the list to point to this inode. */
> > @@ -2127,6 +2296,17 @@ xfs_iunlink_map_prev(
> >  
> >  	ASSERT(head_agino != target_agino);
> >  
> > +	/* See if our backref cache can find it faster. */
> > +	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
> > +	if (next_agino != NULLAGINO) {
> > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > +		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
> > +				&last_ibp);
> > +		if (error)
> > +			return error;
> 
> Could we assert or somehow or another verify that
> last_dip->di_next_unlinked points at target_agino before we proceed?

Ok.

> > +		goto out;
> > +	}
> > +
> >  	next_agino = head_agino;
> >  	while (next_agino != target_agino) {
> >  		xfs_agino_t	unlinked_agino;
> > @@ -2156,6 +2336,7 @@ xfs_iunlink_map_prev(
> >  		next_agino = unlinked_agino;
> >  	}
> >  
> > +out:
> >  	/* Should never happen, but don't return garbage. */
> >  	if (next_ino == NULLFSINO)
> >  		return -EFSCORRUPTED;
> > @@ -2218,6 +2399,20 @@ xfs_iunlink_remove(
> >  	if (error)
> >  		goto out;
> >  
> > +	/*
> > +	 * If there was a backref pointing from the next inode back to this
> > +	 * one, remove it because we've removed this inode from the list.
> > +	 *
> > +	 * Later, if this inode was in the middle of the list we'll update
> > +	 * this inode's backref to point from the next inode.
> > +	 */
> > +	if (next_agino != NULLAGINO) {
> > +		error = xfs_iunlink_change_backref(pag, next_agino,
> > +				NULLAGINO);
> > +		if (error)
> > +			goto out;
> > +	}
> > +
> >  	if (head_agino == agino) {
> >  		/* Point the head of the list to the next unlinked inode. */
> >  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> > @@ -2236,6 +2431,18 @@ xfs_iunlink_remove(
> >  		/* Point the previous inode on the list to the next inode. */
> >  		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
> >  				prev_ino, next_agino);
> > +
> > +		/*
> > +		 * Now we deal with the backref for this inode.  If this inode
> > +		 * pointed at a real inode, change the backref that pointed to
> > +		 * us to point to our old next.  If this inode was the end of
> > +		 * the list, delete the backref that pointed to us.  Note that
> > +		 * change_backref takes care of deleting the backref if
> > +		 * next_agino is NULLAGINO.
> > +		 */
> 
> Thanks for the comment.
> 
> > +		error = xfs_iunlink_change_backref(pag, agino, next_agino);
> > +		if (error)
> > +			goto out;
> 
> Ok, but the whole lookup path accounts for the possibility of a missing
> entry and falls back to the old on-disk walk. It doesn't look like we
> handle that properly here. IOW, if the hashtable lookup ever fails and
> we do fallback as such, we're guaranteed to eventually fail here.
> 
> It seems to me we need to be extra careful so as to not turn in-core
> hashtable issues (whether it be external things such as -ENOENT or
> internal -ENOMEM or whatever) into fatal filesystem errors.

<nod> I'll fix this for v3 so that backref cache failures don't shut
down the filesystem.

> >  	}
> >  	pag->pagi_unlinked_count--;
> >  out:
> ...
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index c634fbfea4a8..f5fb8885662f 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -5078,10 +5078,20 @@ xlog_recover_process_one_iunlink(
> >  	agino = be32_to_cpu(dip->di_next_unlinked);
> >  	xfs_buf_relse(ibp);
> >  
> > -	/* Make sure the in-core data knows about this unlinked inode. */
> > +	/*
> > +	 * Make sure the in-core data knows about this unlinked inode.  Since
> > +	 * our iunlinks recovery basically just deletes the head of a bucket
> > +	 * list until the bucket is empty, we need only to add the backref from
> > +	 * the current list item to the next one, if this isn't the list tail.
> > +	 */
> >  	pag = xfs_perag_get(mp, agno);
> >  	pag->pagi_unlinked_count++;
> > +	if (agino != NULLAGINO)
> > +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> > +				agino);
> 
> ISTM that fixing the iunlink_remove code to not rely on hashtable
> entries could also alleviate the need for this hack?
> 
> >  	xfs_perag_put(pag);
> > +	if (error)
> > +		goto fail_iput;
> 
> Similar potential for error amplification here.

Agreed.

--D

> Brian
> 
> >  
> >  	/*
> >  	 * Prevent any DMAPI event from being sent when the reference on
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 6ca92a71c233..9a181f7ca1d5 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -151,6 +151,7 @@ xfs_free_perag(
> >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> >  		ASSERT(pag->pagi_unlinked_count == 0 ||
> >  		       XFS_FORCED_SHUTDOWN(mp));
> > +		xfs_iunlink_destroy(pag);
> >  		xfs_buf_hash_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> > @@ -229,6 +230,9 @@ xfs_initialize_perag(
> >  		/* first new pag is fully initialized */
> >  		if (first_initialised == NULLAGNUMBER)
> >  			first_initialised = index;
> > +		error = xfs_iunlink_init(pag);
> > +		if (error)
> > +			goto out_hash_destroy;
> >  	}
> >  
> >  	index = xfs_set_inode_alloc(mp, agcount);
> > @@ -251,6 +255,7 @@ xfs_initialize_perag(
> >  		if (!pag)
> >  			break;
> >  		xfs_buf_hash_destroy(pag);
> > +		xfs_iunlink_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		kmem_free(pag);
> >  	}
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index 169069a01f3c..dcc45b441dd6 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -391,6 +391,7 @@ typedef struct xfs_perag {
> >  
> >  	/* unlinked inode info; lock AGI to access */
> >  	unsigned int		pagi_unlinked_count;
> > +	struct rhashtable	pagi_unlinked_hash;
> >  } xfs_perag_t;
> >  
> >  static inline struct xfs_ag_resv *
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 17:53     ` Darrick J. Wong
@ 2019-02-05 17:57       ` Christoph Hellwig
  2019-02-05 18:17         ` Darrick J. Wong
  2019-02-05 19:06       ` Brian Foster
  2019-02-05 20:59       ` Dave Chinner
  2 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-05 17:57 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, linux-xfs

On Tue, Feb 05, 2019 at 09:53:09AM -0800, Darrick J. Wong wrote:
> Good point, we could be much more resilient to backref cache failures
> since we do have the option of doing it the slow way.

I don't think there is any failure that can happen during normal
operation as long as we use KM_SLEEP..

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 17:57       ` Christoph Hellwig
@ 2019-02-05 18:17         ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05 18:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Brian Foster, linux-xfs

On Tue, Feb 05, 2019 at 09:57:02AM -0800, Christoph Hellwig wrote:
> On Tue, Feb 05, 2019 at 09:53:09AM -0800, Darrick J. Wong wrote:
> > Good point, we could be much more resilient to backref cache failures
> > since we do have the option of doing it the slow way.
> 
> I don't think there is any failure that can happen during normal
> operation as long as we use KM_SLEEP..

I dug even deeper into the rhashtable code it looks like it can fail a
GFP_ATOMIC allocation for new buckets, which will bubble ENOMEM up to
the caller, so Brian's right that we have to handle various errors, and
should do so in a manner that doesn't kill the filesystem.

Seeing as it's a backref cache and not critical to correct operation, I
think I can change add_backref to ignore errors other than EEXIST
(because having duplicate entries is a sign that we've screwed something
up).  change_backref can absorb all the error codes, though it'll have
to handle the case that lookup_fast returns ENOENT.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 17:53     ` Darrick J. Wong
  2019-02-05 17:57       ` Christoph Hellwig
@ 2019-02-05 19:06       ` Brian Foster
  2019-02-05 19:20         ` Darrick J. Wong
  2019-02-05 20:59       ` Dave Chinner
  2 siblings, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-05 19:06 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Tue, Feb 05, 2019 at 09:53:09AM -0800, Darrick J. Wong wrote:
> On Tue, Feb 05, 2019 at 09:24:59AM -0500, Brian Foster wrote:
> > On Mon, Feb 04, 2019 at 10:00:05AM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Use a rhashtable to cache the unlinked list incore.  This should speed
> > > up unlinked processing considerably when there are a lot of inodes on
> > > the unlinked list because iunlink_remove no longer has to traverse an
> > > entire bucket list to find which inode points to the one being removed.
> > > 
> > > The incore list structure records "X.next_unlinked = Y" relations, with
> > > the rhashtable using Y to index the records.  This makes finding the
> > > inode X that points to a inode Y very quick.  If our cache fails to find
> > > anything we can always fall back on the old method.
> > > 
> > > FWIW this drastically reduces the amount of time it takes to remove
> > > inodes from the unlinked list.  I wrote a program to open a lot of
> > > O_TMPFILE files and then close them in the same order, which takes
> > > a very long time if we have to traverse the unlinked lists.  With the
> > > ptach, I see:
> > > 
> > ...
> > > ---
> > >  fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_inode.h       |    9 ++
> > >  fs/xfs/xfs_log_recover.c |   12 ++-
> > >  fs/xfs/xfs_mount.c       |    5 +
> > >  fs/xfs/xfs_mount.h       |    1 
> > >  5 files changed, 233 insertions(+), 1 deletion(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index b9696d762c8f..baee8c894447 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -1880,6 +1880,167 @@ xfs_inactive(
> > >  	xfs_qm_dqdetach(ip);
> > >  }
> > >  
> > ...
> > > +
> > > +static const struct rhashtable_params xfs_iunlink_hash_params = {
> > > +	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> > > +	.nelem_hint		= 512,
> > 
> > Any reasoning behind the 512 value? It seems rather large to me, at
> > least until we get more into deferred inactivation and whatnot. It looks
> > like the rhashtable code will round this up to 1024 as well, FWIW.
> > 
> > I'm also wondering whether a kmem_zone might be worthwhile for
> > xfs_iunlink structures, but that's probably also more for when we expect
> > to drive deeper unlinked lists.
> 
> I picked an arbitrary value of 64 buckets * 8 items per list.  I /do/
> have plans to test various values to see if there's a particular sweet
> spot, though I guess this could be much lower on the assumption that
> we don't expect /that/ many unlinked inodes(?)
> 

Ok. To be clear, I don't really have any insight as to what a good value
is, I was just curious where 512 came from. 8 items per bucket sounds
more reasonable when you frame it that way. This only looked like an 8k
or so allocation in the hashtable code, which seems reasonable, but then
again this is a per-ag allocation.

Hmm, I suppose if you just include something like a /* 64 buckets * 8
items per list */ comment on that line so it's clear where the number
comes from and then verify this doesn't cause a mount of an fs with some
absurd number of AGs to fall over or steal an unreasonable amount of
memory then it's probably fine.

> > > +	.key_len		= sizeof(xfs_agino_t),
> > > +	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
> > > +	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
> > > +	.automatic_shrinking	= true,
> > > +	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
> > > +};
> > > +
> > ...
> > >  /*
> > >   * Point the AGI unlinked bucket at an inode and log the results.  The caller
> > >   * is responsible for validating the old value.
> > > @@ -2055,6 +2216,14 @@ xfs_iunlink(
> > >  		if (error)
> > >  			goto out;
> > >  		ASSERT(old_agino == NULLAGINO);
> > > +
> > > +		/*
> > > +		 * agino has been unlinked, add a backref from the next inode
> > > +		 * back to agino.
> > > +		 */
> > > +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> > > +		if (error)
> > > +			goto out;
> > 
> > At a glance, it looks like -ENOMEM/-EEXIST is possible from
> > rhashtable_insert_fast(). Do we really want to translate those into a
> > broader operational failure, or perhaps just skip the hashtable update?
> > The latter seems more appropriate given that we already account for the
> > possibility of a missing hashtable entry on lookup.
> 
> Good point, we could be much more resilient to backref cache failures
> since we do have the option of doing it the slow way.
> 
> (And, I guess by extension, a debugging knob or something to disable the
> cache so that we can test the bucket list walker...)
> 

Good idea. An error tag on the add backref path perhaps? Then we can
cover anything from random insertion failures to turning it off
completely.

Brian

> > >  	}
> > >  
> > >  	/* Point the head of the list to point to this inode. */
> > > @@ -2127,6 +2296,17 @@ xfs_iunlink_map_prev(
> > >  
> > >  	ASSERT(head_agino != target_agino);
> > >  
> > > +	/* See if our backref cache can find it faster. */
> > > +	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
> > > +	if (next_agino != NULLAGINO) {
> > > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > > +		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
> > > +				&last_ibp);
> > > +		if (error)
> > > +			return error;
> > 
> > Could we assert or somehow or another verify that
> > last_dip->di_next_unlinked points at target_agino before we proceed?
> 
> Ok.
> 
> > > +		goto out;
> > > +	}
> > > +
> > >  	next_agino = head_agino;
> > >  	while (next_agino != target_agino) {
> > >  		xfs_agino_t	unlinked_agino;
> > > @@ -2156,6 +2336,7 @@ xfs_iunlink_map_prev(
> > >  		next_agino = unlinked_agino;
> > >  	}
> > >  
> > > +out:
> > >  	/* Should never happen, but don't return garbage. */
> > >  	if (next_ino == NULLFSINO)
> > >  		return -EFSCORRUPTED;
> > > @@ -2218,6 +2399,20 @@ xfs_iunlink_remove(
> > >  	if (error)
> > >  		goto out;
> > >  
> > > +	/*
> > > +	 * If there was a backref pointing from the next inode back to this
> > > +	 * one, remove it because we've removed this inode from the list.
> > > +	 *
> > > +	 * Later, if this inode was in the middle of the list we'll update
> > > +	 * this inode's backref to point from the next inode.
> > > +	 */
> > > +	if (next_agino != NULLAGINO) {
> > > +		error = xfs_iunlink_change_backref(pag, next_agino,
> > > +				NULLAGINO);
> > > +		if (error)
> > > +			goto out;
> > > +	}
> > > +
> > >  	if (head_agino == agino) {
> > >  		/* Point the head of the list to the next unlinked inode. */
> > >  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> > > @@ -2236,6 +2431,18 @@ xfs_iunlink_remove(
> > >  		/* Point the previous inode on the list to the next inode. */
> > >  		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
> > >  				prev_ino, next_agino);
> > > +
> > > +		/*
> > > +		 * Now we deal with the backref for this inode.  If this inode
> > > +		 * pointed at a real inode, change the backref that pointed to
> > > +		 * us to point to our old next.  If this inode was the end of
> > > +		 * the list, delete the backref that pointed to us.  Note that
> > > +		 * change_backref takes care of deleting the backref if
> > > +		 * next_agino is NULLAGINO.
> > > +		 */
> > 
> > Thanks for the comment.
> > 
> > > +		error = xfs_iunlink_change_backref(pag, agino, next_agino);
> > > +		if (error)
> > > +			goto out;
> > 
> > Ok, but the whole lookup path accounts for the possibility of a missing
> > entry and falls back to the old on-disk walk. It doesn't look like we
> > handle that properly here. IOW, if the hashtable lookup ever fails and
> > we do fallback as such, we're guaranteed to eventually fail here.
> > 
> > It seems to me we need to be extra careful so as to not turn in-core
> > hashtable issues (whether it be external things such as -ENOENT or
> > internal -ENOMEM or whatever) into fatal filesystem errors.
> 
> <nod> I'll fix this for v3 so that backref cache failures don't shut
> down the filesystem.
> 
> > >  	}
> > >  	pag->pagi_unlinked_count--;
> > >  out:
> > ...
> > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > > index c634fbfea4a8..f5fb8885662f 100644
> > > --- a/fs/xfs/xfs_log_recover.c
> > > +++ b/fs/xfs/xfs_log_recover.c
> > > @@ -5078,10 +5078,20 @@ xlog_recover_process_one_iunlink(
> > >  	agino = be32_to_cpu(dip->di_next_unlinked);
> > >  	xfs_buf_relse(ibp);
> > >  
> > > -	/* Make sure the in-core data knows about this unlinked inode. */
> > > +	/*
> > > +	 * Make sure the in-core data knows about this unlinked inode.  Since
> > > +	 * our iunlinks recovery basically just deletes the head of a bucket
> > > +	 * list until the bucket is empty, we need only to add the backref from
> > > +	 * the current list item to the next one, if this isn't the list tail.
> > > +	 */
> > >  	pag = xfs_perag_get(mp, agno);
> > >  	pag->pagi_unlinked_count++;
> > > +	if (agino != NULLAGINO)
> > > +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> > > +				agino);
> > 
> > ISTM that fixing the iunlink_remove code to not rely on hashtable
> > entries could also alleviate the need for this hack?
> > 
> > >  	xfs_perag_put(pag);
> > > +	if (error)
> > > +		goto fail_iput;
> > 
> > Similar potential for error amplification here.
> 
> Agreed.
> 
> --D
> 
> > Brian
> > 
> > >  
> > >  	/*
> > >  	 * Prevent any DMAPI event from being sent when the reference on
> > > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > > index 6ca92a71c233..9a181f7ca1d5 100644
> > > --- a/fs/xfs/xfs_mount.c
> > > +++ b/fs/xfs/xfs_mount.c
> > > @@ -151,6 +151,7 @@ xfs_free_perag(
> > >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> > >  		ASSERT(pag->pagi_unlinked_count == 0 ||
> > >  		       XFS_FORCED_SHUTDOWN(mp));
> > > +		xfs_iunlink_destroy(pag);
> > >  		xfs_buf_hash_destroy(pag);
> > >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> > >  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> > > @@ -229,6 +230,9 @@ xfs_initialize_perag(
> > >  		/* first new pag is fully initialized */
> > >  		if (first_initialised == NULLAGNUMBER)
> > >  			first_initialised = index;
> > > +		error = xfs_iunlink_init(pag);
> > > +		if (error)
> > > +			goto out_hash_destroy;
> > >  	}
> > >  
> > >  	index = xfs_set_inode_alloc(mp, agcount);
> > > @@ -251,6 +255,7 @@ xfs_initialize_perag(
> > >  		if (!pag)
> > >  			break;
> > >  		xfs_buf_hash_destroy(pag);
> > > +		xfs_iunlink_destroy(pag);
> > >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> > >  		kmem_free(pag);
> > >  	}
> > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > > index 169069a01f3c..dcc45b441dd6 100644
> > > --- a/fs/xfs/xfs_mount.h
> > > +++ b/fs/xfs/xfs_mount.h
> > > @@ -391,6 +391,7 @@ typedef struct xfs_perag {
> > >  
> > >  	/* unlinked inode info; lock AGI to access */
> > >  	unsigned int		pagi_unlinked_count;
> > > +	struct rhashtable	pagi_unlinked_hash;
> > >  } xfs_perag_t;
> > >  
> > >  static inline struct xfs_ag_resv *
> > > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 19:06       ` Brian Foster
@ 2019-02-05 19:20         ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-05 19:20 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Tue, Feb 05, 2019 at 02:06:27PM -0500, Brian Foster wrote:
> On Tue, Feb 05, 2019 at 09:53:09AM -0800, Darrick J. Wong wrote:
> > On Tue, Feb 05, 2019 at 09:24:59AM -0500, Brian Foster wrote:
> > > On Mon, Feb 04, 2019 at 10:00:05AM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Use a rhashtable to cache the unlinked list incore.  This should speed
> > > > up unlinked processing considerably when there are a lot of inodes on
> > > > the unlinked list because iunlink_remove no longer has to traverse an
> > > > entire bucket list to find which inode points to the one being removed.
> > > > 
> > > > The incore list structure records "X.next_unlinked = Y" relations, with
> > > > the rhashtable using Y to index the records.  This makes finding the
> > > > inode X that points to a inode Y very quick.  If our cache fails to find
> > > > anything we can always fall back on the old method.
> > > > 
> > > > FWIW this drastically reduces the amount of time it takes to remove
> > > > inodes from the unlinked list.  I wrote a program to open a lot of
> > > > O_TMPFILE files and then close them in the same order, which takes
> > > > a very long time if we have to traverse the unlinked lists.  With the
> > > > ptach, I see:
> > > > 
> > > ...
> > > > ---
> > > >  fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/xfs_inode.h       |    9 ++
> > > >  fs/xfs/xfs_log_recover.c |   12 ++-
> > > >  fs/xfs/xfs_mount.c       |    5 +
> > > >  fs/xfs/xfs_mount.h       |    1 
> > > >  5 files changed, 233 insertions(+), 1 deletion(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > index b9696d762c8f..baee8c894447 100644
> > > > --- a/fs/xfs/xfs_inode.c
> > > > +++ b/fs/xfs/xfs_inode.c
> > > > @@ -1880,6 +1880,167 @@ xfs_inactive(
> > > >  	xfs_qm_dqdetach(ip);
> > > >  }
> > > >  
> > > ...
> > > > +
> > > > +static const struct rhashtable_params xfs_iunlink_hash_params = {
> > > > +	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> > > > +	.nelem_hint		= 512,
> > > 
> > > Any reasoning behind the 512 value? It seems rather large to me, at
> > > least until we get more into deferred inactivation and whatnot. It looks
> > > like the rhashtable code will round this up to 1024 as well, FWIW.
> > > 
> > > I'm also wondering whether a kmem_zone might be worthwhile for
> > > xfs_iunlink structures, but that's probably also more for when we expect
> > > to drive deeper unlinked lists.
> > 
> > I picked an arbitrary value of 64 buckets * 8 items per list.  I /do/
> > have plans to test various values to see if there's a particular sweet
> > spot, though I guess this could be much lower on the assumption that
> > we don't expect /that/ many unlinked inodes(?)
> > 
> 
> Ok. To be clear, I don't really have any insight as to what a good value
> is, I was just curious where 512 came from. 8 items per bucket sounds
> more reasonable when you frame it that way. This only looked like an 8k
> or so allocation in the hashtable code, which seems reasonable, but then
> again this is a per-ag allocation.

Good point, we should keep our rhashtable heads smaller than a page.
I'll turn this down to 256.  FWIW I tried exercising 64 -> 128 -> 256 ->
512 -> 1024 (factoring in the 4/3 thing) and it didn't really seem to
make much of a difference.

> Hmm, I suppose if you just include something like a /* 64 buckets * 8
> items per list */ comment on that line so it's clear where the number
> comes from and then verify this doesn't cause a mount of an fs with some
> absurd number of AGs to fall over or steal an unreasonable amount of
> memory then it's probably fine.

Ok.

> > > > +	.key_len		= sizeof(xfs_agino_t),
> > > > +	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
> > > > +	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
> > > > +	.automatic_shrinking	= true,
> > > > +	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
> > > > +};
> > > > +
> > > ...
> > > >  /*
> > > >   * Point the AGI unlinked bucket at an inode and log the results.  The caller
> > > >   * is responsible for validating the old value.
> > > > @@ -2055,6 +2216,14 @@ xfs_iunlink(
> > > >  		if (error)
> > > >  			goto out;
> > > >  		ASSERT(old_agino == NULLAGINO);
> > > > +
> > > > +		/*
> > > > +		 * agino has been unlinked, add a backref from the next inode
> > > > +		 * back to agino.
> > > > +		 */
> > > > +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> > > > +		if (error)
> > > > +			goto out;
> > > 
> > > At a glance, it looks like -ENOMEM/-EEXIST is possible from
> > > rhashtable_insert_fast(). Do we really want to translate those into a
> > > broader operational failure, or perhaps just skip the hashtable update?
> > > The latter seems more appropriate given that we already account for the
> > > possibility of a missing hashtable entry on lookup.
> > 
> > Good point, we could be much more resilient to backref cache failures
> > since we do have the option of doing it the slow way.
> > 
> > (And, I guess by extension, a debugging knob or something to disable the
> > cache so that we can test the bucket list walker...)
> > 
> 
> Good idea. An error tag on the add backref path perhaps? Then we can
> cover anything from random insertion failures to turning it off
> completely.

Ok.

--D

> Brian
> 
> > > >  	}
> > > >  
> > > >  	/* Point the head of the list to point to this inode. */
> > > > @@ -2127,6 +2296,17 @@ xfs_iunlink_map_prev(
> > > >  
> > > >  	ASSERT(head_agino != target_agino);
> > > >  
> > > > +	/* See if our backref cache can find it faster. */
> > > > +	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
> > > > +	if (next_agino != NULLAGINO) {
> > > > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > > > +		error = xfs_iunlink_map_ino(tp, next_ino, imap, &last_dip,
> > > > +				&last_ibp);
> > > > +		if (error)
> > > > +			return error;
> > > 
> > > Could we assert or somehow or another verify that
> > > last_dip->di_next_unlinked points at target_agino before we proceed?
> > 
> > Ok.
> > 
> > > > +		goto out;
> > > > +	}
> > > > +
> > > >  	next_agino = head_agino;
> > > >  	while (next_agino != target_agino) {
> > > >  		xfs_agino_t	unlinked_agino;
> > > > @@ -2156,6 +2336,7 @@ xfs_iunlink_map_prev(
> > > >  		next_agino = unlinked_agino;
> > > >  	}
> > > >  
> > > > +out:
> > > >  	/* Should never happen, but don't return garbage. */
> > > >  	if (next_ino == NULLFSINO)
> > > >  		return -EFSCORRUPTED;
> > > > @@ -2218,6 +2399,20 @@ xfs_iunlink_remove(
> > > >  	if (error)
> > > >  		goto out;
> > > >  
> > > > +	/*
> > > > +	 * If there was a backref pointing from the next inode back to this
> > > > +	 * one, remove it because we've removed this inode from the list.
> > > > +	 *
> > > > +	 * Later, if this inode was in the middle of the list we'll update
> > > > +	 * this inode's backref to point from the next inode.
> > > > +	 */
> > > > +	if (next_agino != NULLAGINO) {
> > > > +		error = xfs_iunlink_change_backref(pag, next_agino,
> > > > +				NULLAGINO);
> > > > +		if (error)
> > > > +			goto out;
> > > > +	}
> > > > +
> > > >  	if (head_agino == agino) {
> > > >  		/* Point the head of the list to the next unlinked inode. */
> > > >  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> > > > @@ -2236,6 +2431,18 @@ xfs_iunlink_remove(
> > > >  		/* Point the previous inode on the list to the next inode. */
> > > >  		xfs_iunlink_update_dinode(tp, agno, last_ibp, last_dip, &imap,
> > > >  				prev_ino, next_agino);
> > > > +
> > > > +		/*
> > > > +		 * Now we deal with the backref for this inode.  If this inode
> > > > +		 * pointed at a real inode, change the backref that pointed to
> > > > +		 * us to point to our old next.  If this inode was the end of
> > > > +		 * the list, delete the backref that pointed to us.  Note that
> > > > +		 * change_backref takes care of deleting the backref if
> > > > +		 * next_agino is NULLAGINO.
> > > > +		 */
> > > 
> > > Thanks for the comment.
> > > 
> > > > +		error = xfs_iunlink_change_backref(pag, agino, next_agino);
> > > > +		if (error)
> > > > +			goto out;
> > > 
> > > Ok, but the whole lookup path accounts for the possibility of a missing
> > > entry and falls back to the old on-disk walk. It doesn't look like we
> > > handle that properly here. IOW, if the hashtable lookup ever fails and
> > > we do fallback as such, we're guaranteed to eventually fail here.
> > > 
> > > It seems to me we need to be extra careful so as to not turn in-core
> > > hashtable issues (whether it be external things such as -ENOENT or
> > > internal -ENOMEM or whatever) into fatal filesystem errors.
> > 
> > <nod> I'll fix this for v3 so that backref cache failures don't shut
> > down the filesystem.
> > 
> > > >  	}
> > > >  	pag->pagi_unlinked_count--;
> > > >  out:
> > > ...
> > > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > > > index c634fbfea4a8..f5fb8885662f 100644
> > > > --- a/fs/xfs/xfs_log_recover.c
> > > > +++ b/fs/xfs/xfs_log_recover.c
> > > > @@ -5078,10 +5078,20 @@ xlog_recover_process_one_iunlink(
> > > >  	agino = be32_to_cpu(dip->di_next_unlinked);
> > > >  	xfs_buf_relse(ibp);
> > > >  
> > > > -	/* Make sure the in-core data knows about this unlinked inode. */
> > > > +	/*
> > > > +	 * Make sure the in-core data knows about this unlinked inode.  Since
> > > > +	 * our iunlinks recovery basically just deletes the head of a bucket
> > > > +	 * list until the bucket is empty, we need only to add the backref from
> > > > +	 * the current list item to the next one, if this isn't the list tail.
> > > > +	 */
> > > >  	pag = xfs_perag_get(mp, agno);
> > > >  	pag->pagi_unlinked_count++;
> > > > +	if (agino != NULLAGINO)
> > > > +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> > > > +				agino);
> > > 
> > > ISTM that fixing the iunlink_remove code to not rely on hashtable
> > > entries could also alleviate the need for this hack?
> > > 
> > > >  	xfs_perag_put(pag);
> > > > +	if (error)
> > > > +		goto fail_iput;
> > > 
> > > Similar potential for error amplification here.
> > 
> > Agreed.
> > 
> > --D
> > 
> > > Brian
> > > 
> > > >  
> > > >  	/*
> > > >  	 * Prevent any DMAPI event from being sent when the reference on
> > > > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > > > index 6ca92a71c233..9a181f7ca1d5 100644
> > > > --- a/fs/xfs/xfs_mount.c
> > > > +++ b/fs/xfs/xfs_mount.c
> > > > @@ -151,6 +151,7 @@ xfs_free_perag(
> > > >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> > > >  		ASSERT(pag->pagi_unlinked_count == 0 ||
> > > >  		       XFS_FORCED_SHUTDOWN(mp));
> > > > +		xfs_iunlink_destroy(pag);
> > > >  		xfs_buf_hash_destroy(pag);
> > > >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> > > >  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> > > > @@ -229,6 +230,9 @@ xfs_initialize_perag(
> > > >  		/* first new pag is fully initialized */
> > > >  		if (first_initialised == NULLAGNUMBER)
> > > >  			first_initialised = index;
> > > > +		error = xfs_iunlink_init(pag);
> > > > +		if (error)
> > > > +			goto out_hash_destroy;
> > > >  	}
> > > >  
> > > >  	index = xfs_set_inode_alloc(mp, agcount);
> > > > @@ -251,6 +255,7 @@ xfs_initialize_perag(
> > > >  		if (!pag)
> > > >  			break;
> > > >  		xfs_buf_hash_destroy(pag);
> > > > +		xfs_iunlink_destroy(pag);
> > > >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> > > >  		kmem_free(pag);
> > > >  	}
> > > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > > > index 169069a01f3c..dcc45b441dd6 100644
> > > > --- a/fs/xfs/xfs_mount.h
> > > > +++ b/fs/xfs/xfs_mount.h
> > > > @@ -391,6 +391,7 @@ typedef struct xfs_perag {
> > > >  
> > > >  	/* unlinked inode info; lock AGI to access */
> > > >  	unsigned int		pagi_unlinked_count;
> > > > +	struct rhashtable	pagi_unlinked_hash;
> > > >  } xfs_perag_t;
> > > >  
> > > >  static inline struct xfs_ag_resv *
> > > > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 17:53     ` Darrick J. Wong
  2019-02-05 17:57       ` Christoph Hellwig
  2019-02-05 19:06       ` Brian Foster
@ 2019-02-05 20:59       ` Dave Chinner
  2019-02-06 18:25         ` Darrick J. Wong
  2 siblings, 1 reply; 42+ messages in thread
From: Dave Chinner @ 2019-02-05 20:59 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, linux-xfs

On Tue, Feb 05, 2019 at 09:53:09AM -0800, Darrick J. Wong wrote:
> On Tue, Feb 05, 2019 at 09:24:59AM -0500, Brian Foster wrote:
> > On Mon, Feb 04, 2019 at 10:00:05AM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Use a rhashtable to cache the unlinked list incore.  This should speed
> > > up unlinked processing considerably when there are a lot of inodes on
> > > the unlinked list because iunlink_remove no longer has to traverse an
> > > entire bucket list to find which inode points to the one being removed.
> > > 
> > > The incore list structure records "X.next_unlinked = Y" relations, with
> > > the rhashtable using Y to index the records.  This makes finding the
> > > inode X that points to a inode Y very quick.  If our cache fails to find
> > > anything we can always fall back on the old method.
> > > 
> > > FWIW this drastically reduces the amount of time it takes to remove
> > > inodes from the unlinked list.  I wrote a program to open a lot of
> > > O_TMPFILE files and then close them in the same order, which takes
> > > a very long time if we have to traverse the unlinked lists.  With the
> > > ptach, I see:
> > > 
> > ...
> > > ---
> > >  fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_inode.h       |    9 ++
> > >  fs/xfs/xfs_log_recover.c |   12 ++-
> > >  fs/xfs/xfs_mount.c       |    5 +
> > >  fs/xfs/xfs_mount.h       |    1 
> > >  5 files changed, 233 insertions(+), 1 deletion(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index b9696d762c8f..baee8c894447 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -1880,6 +1880,167 @@ xfs_inactive(
> > >  	xfs_qm_dqdetach(ip);
> > >  }
> > >  
> > ...
> > > +
> > > +static const struct rhashtable_params xfs_iunlink_hash_params = {
> > > +	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> > > +	.nelem_hint		= 512,
> > 
> > Any reasoning behind the 512 value? It seems rather large to me, at
> > least until we get more into deferred inactivation and whatnot. It looks
> > like the rhashtable code will round this up to 1024 as well, FWIW.
> > 
> > I'm also wondering whether a kmem_zone might be worthwhile for
> > xfs_iunlink structures, but that's probably also more for when we expect
> > to drive deeper unlinked lists.
> 
> I picked an arbitrary value of 64 buckets * 8 items per list.  I /do/
> have plans to test various values to see if there's a particular sweet
> spot, though I guess this could be much lower on the assumption that
> we don't expect /that/ many unlinked inodes(?)

Seems pretty large, given we use this for the per-ag buffer cache
rhashtable:

     .min_size               = 32,   /* empty AGs have minimal footprint */
     .nelem_hint             = 16,

And nobody notices problems when they grow and shrink and they run
from empty to hundreds of thousands of entries and back again in
very short preiods of time. Hence I'd suggest that we make it as
small as possible to begin with and then only change things if there
are performance problems triggered by growing and shrinking....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable
  2019-02-05 20:59       ` Dave Chinner
@ 2019-02-06 18:25         ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-06 18:25 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, linux-xfs

On Wed, Feb 06, 2019 at 07:59:15AM +1100, Dave Chinner wrote:
> On Tue, Feb 05, 2019 at 09:53:09AM -0800, Darrick J. Wong wrote:
> > On Tue, Feb 05, 2019 at 09:24:59AM -0500, Brian Foster wrote:
> > > On Mon, Feb 04, 2019 at 10:00:05AM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Use a rhashtable to cache the unlinked list incore.  This should speed
> > > > up unlinked processing considerably when there are a lot of inodes on
> > > > the unlinked list because iunlink_remove no longer has to traverse an
> > > > entire bucket list to find which inode points to the one being removed.
> > > > 
> > > > The incore list structure records "X.next_unlinked = Y" relations, with
> > > > the rhashtable using Y to index the records.  This makes finding the
> > > > inode X that points to a inode Y very quick.  If our cache fails to find
> > > > anything we can always fall back on the old method.
> > > > 
> > > > FWIW this drastically reduces the amount of time it takes to remove
> > > > inodes from the unlinked list.  I wrote a program to open a lot of
> > > > O_TMPFILE files and then close them in the same order, which takes
> > > > a very long time if we have to traverse the unlinked lists.  With the
> > > > ptach, I see:
> > > > 
> > > ...
> > > > ---
> > > >  fs/xfs/xfs_inode.c       |  207 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/xfs_inode.h       |    9 ++
> > > >  fs/xfs/xfs_log_recover.c |   12 ++-
> > > >  fs/xfs/xfs_mount.c       |    5 +
> > > >  fs/xfs/xfs_mount.h       |    1 
> > > >  5 files changed, 233 insertions(+), 1 deletion(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > index b9696d762c8f..baee8c894447 100644
> > > > --- a/fs/xfs/xfs_inode.c
> > > > +++ b/fs/xfs/xfs_inode.c
> > > > @@ -1880,6 +1880,167 @@ xfs_inactive(
> > > >  	xfs_qm_dqdetach(ip);
> > > >  }
> > > >  
> > > ...
> > > > +
> > > > +static const struct rhashtable_params xfs_iunlink_hash_params = {
> > > > +	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> > > > +	.nelem_hint		= 512,
> > > 
> > > Any reasoning behind the 512 value? It seems rather large to me, at
> > > least until we get more into deferred inactivation and whatnot. It looks
> > > like the rhashtable code will round this up to 1024 as well, FWIW.
> > > 
> > > I'm also wondering whether a kmem_zone might be worthwhile for
> > > xfs_iunlink structures, but that's probably also more for when we expect
> > > to drive deeper unlinked lists.
> > 
> > I picked an arbitrary value of 64 buckets * 8 items per list.  I /do/
> > have plans to test various values to see if there's a particular sweet
> > spot, though I guess this could be much lower on the assumption that
> > we don't expect /that/ many unlinked inodes(?)
> 
> Seems pretty large, given we use this for the per-ag buffer cache
> rhashtable:
> 
>      .min_size               = 32,   /* empty AGs have minimal footprint */
>      .nelem_hint             = 16,
> 
> And nobody notices problems when they grow and shrink and they run
> from empty to hundreds of thousands of entries and back again in
> very short preiods of time. Hence I'd suggest that we make it as
> small as possible to begin with and then only change things if there
> are performance problems triggered by growing and shrinking....

Yeah, I'll change the patch to remove the nelem_hint, which will get us
a hashtable of size >= 64.

(Ok, I already sent the patches, I just forgot to reply to this.)

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2019-02-06 18:25 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-04 17:58 [PATCH v2 00/10] xfs: incore unlinked list Darrick J. Wong
2019-02-04 17:59 ` [PATCH 01/10] xfs: clean up iunlink functions Darrick J. Wong
2019-02-04 19:24   ` Brian Foster
2019-02-04 20:51   ` Christoph Hellwig
2019-02-04 17:59 ` [PATCH 02/10] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
2019-02-04 19:24   ` Brian Foster
2019-02-04 20:53     ` Christoph Hellwig
2019-02-05  0:26       ` Darrick J. Wong
2019-02-05  6:55         ` Christoph Hellwig
2019-02-05  7:07           ` Darrick J. Wong
2019-02-05  0:42     ` Darrick J. Wong
2019-02-04 17:59 ` [PATCH 03/10] xfs: add xfs_verify_agino_or_null helper Darrick J. Wong
2019-02-04 19:24   ` Brian Foster
2019-02-04 20:53   ` Christoph Hellwig
2019-02-04 17:59 ` [PATCH 04/10] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
2019-02-04 19:25   ` Brian Foster
2019-02-04 20:54   ` Christoph Hellwig
2019-02-04 17:59 ` [PATCH 05/10] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
2019-02-04 17:59 ` [PATCH 06/10] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
2019-02-04 20:56   ` Christoph Hellwig
2019-02-04 21:49     ` Darrick J. Wong
2019-02-05 14:23   ` Brian Foster
2019-02-04 17:59 ` [PATCH 07/10] xfs: refactor unlinked list search and mapping to a separate function Darrick J. Wong
2019-02-04 20:58   ` Christoph Hellwig
2019-02-04 22:23     ` Darrick J. Wong
2019-02-04 17:59 ` [PATCH 08/10] xfs: refactor inode update in iunlink_remove Darrick J. Wong
2019-02-04 20:58   ` Christoph Hellwig
2019-02-05 14:23   ` Brian Foster
2019-02-04 17:59 ` [PATCH 09/10] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
2019-02-04 20:59   ` Christoph Hellwig
2019-02-05 14:24   ` Brian Foster
2019-02-04 18:00 ` [PATCH 10/10] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
2019-02-04 21:08   ` Christoph Hellwig
2019-02-05  1:02     ` Darrick J. Wong
2019-02-05 14:24   ` Brian Foster
2019-02-05 17:53     ` Darrick J. Wong
2019-02-05 17:57       ` Christoph Hellwig
2019-02-05 18:17         ` Darrick J. Wong
2019-02-05 19:06       ` Brian Foster
2019-02-05 19:20         ` Darrick J. Wong
2019-02-05 20:59       ` Dave Chinner
2019-02-06 18:25         ` Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.