All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/8] xfs: incore unlinked list
@ 2019-01-31 23:17 Darrick J. Wong
  2019-01-31 23:17 ` [PATCH 1/8] xfs: clean up iunlink functions Darrick J. Wong
                   ` (8 more replies)
  0 siblings, 9 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:17 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

Hi all,

This new patch series refactors the existing code that handles metadata
updates to the unlinked list when adding or removing inodes from that
list.  It then adds an in-core hashtable to record which inode's
next_unlinked field points to a given inode.  This enables us to remove
any inode from the on-disk unlinked structure without having to actually
walk the entire unlinked list, which reduces overhead substantially.

If you're going to start using this mess, you probably ought to just
pull from my git trees: kernel[1], xfsprogs[2], and xfstests[3].

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

[1] https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=incore-unlinked-list
[2] https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git
[3] https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 1/8] xfs: clean up iunlink functions
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
@ 2019-01-31 23:17 ` Darrick J. Wong
  2019-02-01  8:01   ` Christoph Hellwig
  2019-01-31 23:18 ` [PATCH 2/8] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:17 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Fix some indentation issues with the iunlink functions and reorganize
the tops of the functions to be identical.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   77 +++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8bf02be0003..d18354517320 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1892,26 +1892,32 @@ xfs_inactive(
  */
 STATIC int
 xfs_iunlink(
-	struct xfs_trans *tp,
-	struct xfs_inode *ip)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
-	xfs_mount_t	*mp = tp->t_mountp;
-	xfs_agi_t	*agi;
-	xfs_dinode_t	*dip;
-	xfs_buf_t	*agibp;
-	xfs_buf_t	*ibp;
-	xfs_agino_t	agino;
-	short		bucket_index;
-	int		offset;
-	int		error;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*agibp;
+	struct xfs_buf		*ibp;
+	xfs_agnumber_t		agno;
+	xfs_agino_t		agino;
+	short			bucket_index;
+	int			offset;
+	int			error;
 
 	ASSERT(VFS_I(ip)->i_mode != 0);
+	ASSERT(xfs_verify_ino(mp, ip->i_ino));
+
+	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 
 	/*
 	 * Get the agi buffer first.  It ensures lock ordering
 	 * on the list.
 	 */
-	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
 		return error;
 	agi = XFS_BUF_TO_AGI(agibp);
@@ -1920,9 +1926,6 @@ xfs_iunlink(
 	 * Get the index into the agi hash table for the
 	 * list this inode will go on.
 	 */
-	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	ASSERT(agino != 0);
-	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	ASSERT(agi->agi_unlinked[bucket_index]);
 	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
 
@@ -1969,26 +1972,31 @@ xfs_iunlink(
  */
 STATIC int
 xfs_iunlink_remove(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
-	xfs_ino_t	next_ino;
-	xfs_mount_t	*mp;
-	xfs_agi_t	*agi;
-	xfs_dinode_t	*dip;
-	xfs_buf_t	*agibp;
-	xfs_buf_t	*ibp;
-	xfs_agnumber_t	agno;
-	xfs_agino_t	agino;
-	xfs_agino_t	next_agino;
-	xfs_buf_t	*last_ibp;
-	xfs_dinode_t	*last_dip = NULL;
-	short		bucket_index;
-	int		offset, last_offset = 0;
-	int		error;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*agibp;
+	struct xfs_buf		*ibp;
+	struct xfs_buf		*last_ibp;
+	struct xfs_dinode	*last_dip = NULL;
+	xfs_ino_t		next_ino;
+	xfs_agnumber_t		agno;
+	xfs_agino_t		agino;
+	xfs_agino_t		next_agino;
+	short			bucket_index;
+	int			offset;
+	int			last_offset = 0;
+	int			error;
+
+	if (!xfs_verify_ino(mp, ip->i_ino))
+		return -EFSCORRUPTED;
 
-	mp = tp->t_mountp;
 	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 
 	/*
 	 * Get the agi buffer first.  It ensures lock ordering
@@ -1997,17 +2005,12 @@ xfs_iunlink_remove(
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
 		return error;
-
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
 	 * Get the index into the agi hash table for the
 	 * list this inode will go on.
 	 */
-	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	if (!xfs_verify_agino(mp, agno, agino))
-		return -EFSCORRUPTED;
-	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	if (!xfs_verify_agino(mp, agno,
 			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 2/8] xfs: track unlinked inode counts in per-ag data
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
  2019-01-31 23:17 ` [PATCH 1/8] xfs: clean up iunlink functions Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01 18:59   ` Brian Foster
  2019-01-31 23:18 ` [PATCH 3/8] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Track the number of unlinked inodes in each AG so that we can use these
decisions to throttle inactivations when the unlinked list gets long.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c       |   40 +++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_log_recover.c |    8 ++++++++
 fs/xfs/xfs_mount.c       |    5 +++++
 fs/xfs/xfs_mount.h       |    4 ++++
 4 files changed, 46 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d18354517320..98355f5f9253 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1900,6 +1900,7 @@ xfs_iunlink(
 	struct xfs_dinode	*dip;
 	struct xfs_buf		*agibp;
 	struct xfs_buf		*ibp;
+	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
 	xfs_agino_t		agino;
 	short			bucket_index;
@@ -1912,6 +1913,8 @@ xfs_iunlink(
 	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	pag = xfs_perag_get(mp, agno);
+	mutex_lock(&pag->pagi_unlinked_lock);
 
 	/*
 	 * Get the agi buffer first.  It ensures lock ordering
@@ -1919,7 +1922,7 @@ xfs_iunlink(
 	 */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
-		return error;
+		goto out_unlock;
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
@@ -1939,7 +1942,7 @@ xfs_iunlink(
 		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
 				       0, 0);
 		if (error)
-			return error;
+			goto out_unlock;
 
 		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
 		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
@@ -1964,7 +1967,12 @@ xfs_iunlink(
 		(sizeof(xfs_agino_t) * bucket_index);
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
-	return 0;
+	pag->pagi_unlinked_count++;
+
+out_unlock:
+	mutex_unlock(&pag->pagi_unlinked_lock);
+	xfs_perag_put(pag);
+	return error;
 }
 
 /*
@@ -1982,6 +1990,7 @@ xfs_iunlink_remove(
 	struct xfs_buf		*ibp;
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
+	struct xfs_perag	*pag;
 	xfs_ino_t		next_ino;
 	xfs_agnumber_t		agno;
 	xfs_agino_t		agino;
@@ -1997,6 +2006,8 @@ xfs_iunlink_remove(
 	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	pag = xfs_perag_get(mp, agno);
+	mutex_lock(&pag->pagi_unlinked_lock);
 
 	/*
 	 * Get the agi buffer first.  It ensures lock ordering
@@ -2004,7 +2015,7 @@ xfs_iunlink_remove(
 	 */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
-		return error;
+		goto out_unlock;
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
@@ -2015,7 +2026,8 @@ xfs_iunlink_remove(
 			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
-		return -EFSCORRUPTED;
+		error = -EFSCORRUPTED;
+		goto out_unlock;
 	}
 
 	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
@@ -2031,7 +2043,7 @@ xfs_iunlink_remove(
 		if (error) {
 			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
 				__func__, error);
-			return error;
+			goto out_unlock;
 		}
 		next_agino = be32_to_cpu(dip->di_next_unlinked);
 		ASSERT(next_agino != 0);
@@ -2080,7 +2092,7 @@ xfs_iunlink_remove(
 				xfs_warn(mp,
 	"%s: xfs_imap returned error %d.",
 					 __func__, error);
-				return error;
+				goto out_unlock;
 			}
 
 			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
@@ -2089,7 +2101,7 @@ xfs_iunlink_remove(
 				xfs_warn(mp,
 	"%s: xfs_imap_to_bp returned error %d.",
 					__func__, error);
-				return error;
+				goto out_unlock;
 			}
 
 			last_offset = imap.im_boffset;
@@ -2098,7 +2110,8 @@ xfs_iunlink_remove(
 				XFS_CORRUPTION_ERROR(__func__,
 						XFS_ERRLEVEL_LOW, mp,
 						last_dip, sizeof(*last_dip));
-				return -EFSCORRUPTED;
+				error = -EFSCORRUPTED;
+				goto out_unlock;
 			}
 		}
 
@@ -2111,7 +2124,7 @@ xfs_iunlink_remove(
 		if (error) {
 			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
 				__func__, error);
-			return error;
+			goto out_unlock;
 		}
 		next_agino = be32_to_cpu(dip->di_next_unlinked);
 		ASSERT(next_agino != 0);
@@ -2146,7 +2159,12 @@ xfs_iunlink_remove(
 				  (offset + sizeof(xfs_agino_t) - 1));
 		xfs_inobp_check(mp, last_ibp);
 	}
-	return 0;
+	pag->pagi_unlinked_count--;
+
+out_unlock:
+	mutex_unlock(&pag->pagi_unlinked_lock);
+	xfs_perag_put(pag);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index ff9a27834c50..c920b8aeba01 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5054,6 +5054,7 @@ xlog_recover_process_one_iunlink(
 	struct xfs_buf			*ibp;
 	struct xfs_dinode		*dip;
 	struct xfs_inode		*ip;
+	struct xfs_perag		*pag;
 	xfs_ino_t			ino;
 	int				error;
 
@@ -5077,6 +5078,13 @@ xlog_recover_process_one_iunlink(
 	agino = be32_to_cpu(dip->di_next_unlinked);
 	xfs_buf_relse(ibp);
 
+	/* Make sure the in-core data knows about this unlinked inode. */
+	pag = xfs_perag_get(mp, agno);
+	mutex_lock(&pag->pagi_unlinked_lock);
+	pag->pagi_unlinked_count++;
+	mutex_unlock(&pag->pagi_unlinked_lock);
+	xfs_perag_put(pag);
+
 	/*
 	 * Prevent any DMAPI event from being sent when the reference on
 	 * the inode is dropped.
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10be706ec72e..6bfc985669e0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,9 @@ xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		ASSERT(pag->pagi_unlinked_count == 0 ||
+		       XFS_FORCED_SHUTDOWN(mp));
+		mutex_destroy(&pag->pagi_unlinked_lock);
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +230,7 @@ xfs_initialize_perag(
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
+		mutex_init(&pag->pagi_unlinked_lock);
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ xfs_initialize_perag(
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		mutex_destroy(&pag->pagi_unlinked_lock);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e344b1dfde63..0fcc6b6a4f67 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -388,6 +388,10 @@ typedef struct xfs_perag {
 
 	/* reference count */
 	uint8_t			pagf_refcount_level;
+
+	/* unlinked inodes */
+	struct mutex		pagi_unlinked_lock;
+	uint32_t		pagi_unlinked_count;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 3/8] xfs: refactor AGI unlinked bucket updates
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
  2019-01-31 23:17 ` [PATCH 1/8] xfs: clean up iunlink functions Darrick J. Wong
  2019-01-31 23:18 ` [PATCH 2/8] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01 19:00   ` Brian Foster
  2019-02-02 16:21   ` Christoph Hellwig
  2019-01-31 23:18 ` [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Split the AGI unlinked bucket updates into a separate function.  No
functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   69 ++++++++++++++++++++++++++++++++++++++--------------
 fs/xfs/xfs_trace.h |   26 ++++++++++++++++++++
 2 files changed, 76 insertions(+), 19 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 98355f5f9253..3c33136b21ef 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1880,6 +1880,46 @@ xfs_inactive(
 	xfs_qm_dqdetach(ip);
 }
 
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agibp,
+	unsigned int		bucket_index,
+	xfs_agino_t		new_agino)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
+	xfs_agino_t		old_value;
+	int			offset;
+
+	ASSERT(new_agino == NULLAGINO ||
+	       xfs_verify_agino(tp->t_mountp, be32_to_cpu(agi->agi_seqno),
+				new_agino));
+
+	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	trace_xfs_iunlink_update_bucket(tp->t_mountp,
+			be32_to_cpu(agi->agi_seqno), bucket_index, old_value,
+			new_agino);
+
+	/*
+	 * We should never find the head of the list already set to the value
+	 * passed in because either we're adding or removing ourselves from the
+	 * head of the list.
+	 */
+	if (old_value == new_agino)
+		return -EFSCORRUPTED;
+
+	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+	offset = offsetof(struct xfs_agi, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset,
+			(offset + sizeof(xfs_agino_t) - 1));
+	return 0;
+}
+
 /*
  * This is called when the inode's link count goes to 0 or we are creating a
  * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
@@ -1958,15 +1998,10 @@ xfs_iunlink(
 		xfs_inobp_check(mp, ibp);
 	}
 
-	/*
-	 * Point the bucket head pointer at the inode being inserted.
-	 */
-	ASSERT(agino != 0);
-	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
-	offset = offsetof(xfs_agi_t, agi_unlinked) +
-		(sizeof(xfs_agino_t) * bucket_index);
-	xfs_trans_log_buf(tp, agibp, offset,
-			  (offset + sizeof(xfs_agino_t) - 1));
+	/* Point the head of the list to point to this inode. */
+	error = xfs_iunlink_update_bucket(tp, agibp, bucket_index, agino);
+	if (error)
+		goto out_unlock;
 	pag->pagi_unlinked_count++;
 
 out_unlock:
@@ -2062,16 +2097,12 @@ xfs_iunlink_remove(
 		} else {
 			xfs_trans_brelse(tp, ibp);
 		}
-		/*
-		 * Point the bucket head pointer at the next inode.
-		 */
-		ASSERT(next_agino != 0);
-		ASSERT(next_agino != agino);
-		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
-		offset = offsetof(xfs_agi_t, agi_unlinked) +
-			(sizeof(xfs_agino_t) * bucket_index);
-		xfs_trans_log_buf(tp, agibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
+
+		/* Point the head of the list to the next unlinked inode. */
+		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
+				next_agino);
+		if (error)
+			goto out_unlock;
 	} else {
 		/*
 		 * We need to search the list for the inode being freed.
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..c10478e7e49a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3371,6 +3371,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
 DEFINE_TRANS_EVENT(xfs_trans_add_item);
 DEFINE_TRANS_EVENT(xfs_trans_free_items);
 
+TRACE_EVENT(xfs_iunlink_update_bucket,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+	TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(unsigned int, bucket)
+		__field(xfs_agino_t, old_ptr)
+		__field(xfs_agino_t, new_ptr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->bucket = bucket;
+		__entry->old_ptr = old_ptr;
+		__entry->new_ptr = new_ptr;
+	),
+	TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->bucket,
+		  __entry->old_ptr,
+		  __entry->new_ptr)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
                   ` (2 preceding siblings ...)
  2019-01-31 23:18 ` [PATCH 3/8] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01 19:00   ` Brian Foster
  2019-02-02 16:22   ` Christoph Hellwig
  2019-01-31 23:18 ` [PATCH 5/8] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
                   ` (4 subsequent siblings)
  8 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Strengthen our checking of the AGI unlinked pointers when we start to
use them for updating the metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3c33136b21ef..4ddda3f3255f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1942,6 +1942,7 @@ xfs_iunlink(
 	struct xfs_buf		*ibp;
 	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
+	xfs_agino_t		next_agino;
 	xfs_agino_t		agino;
 	short			bucket_index;
 	int			offset;
@@ -1966,13 +1967,19 @@ xfs_iunlink(
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
-	 * Get the index into the agi hash table for the
-	 * list this inode will go on.
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the pointer isn't garbage and that this inode
+	 * isn't already on the list.
 	 */
-	ASSERT(agi->agi_unlinked[bucket_index]);
-	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (next_agino == agino ||
+	    (next_agino != NULLAGINO &&
+	     !xfs_verify_agino(mp, agno, next_agino))) {
+		error = -EFSCORRUPTED;
+		goto out_unlock;
+	}
 
-	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
+	if (next_agino != NULLAGINO) {
 		/*
 		 * There is already another inode in the bucket we need
 		 * to add ourselves to.  Add us at the front of the list.
@@ -2054,18 +2061,18 @@ xfs_iunlink_remove(
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
-	 * Get the index into the agi hash table for the
-	 * list this inode will go on.
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the head pointer isn't garbage.
 	 */
-	if (!xfs_verify_agino(mp, agno,
-			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(mp, agno, next_agino)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
 		error = -EFSCORRUPTED;
 		goto out_unlock;
 	}
 
-	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
+	if (next_agino == agino) {
 		/*
 		 * We're at the head of the list.  Get the inode's on-disk
 		 * buffer to see if there is anyone after us on the list.
@@ -2107,7 +2114,6 @@ xfs_iunlink_remove(
 		/*
 		 * We need to search the list for the inode being freed.
 		 */
-		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
 		last_ibp = NULL;
 		while (next_agino != agino) {
 			struct xfs_imap	imap;

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 5/8] xfs: refactor inode unlinked pointer update functions
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
                   ` (3 preceding siblings ...)
  2019-01-31 23:18 ` [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01 19:01   ` Brian Foster
  2019-02-02 16:27   ` Christoph Hellwig
  2019-01-31 23:18 ` [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function Darrick J. Wong
                   ` (3 subsequent siblings)
  8 siblings, 2 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Hoist the functions that update an inode's unlinked pointer updates into
a helper.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |  188 +++++++++++++++++++++++++++-------------------------
 fs/xfs/xfs_trace.h |   26 +++++++
 2 files changed, 124 insertions(+), 90 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4ddda3f3255f..ea38b66fbc59 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1920,6 +1920,88 @@ xfs_iunlink_update_bucket(
 	return 0;
 }
 
+/* Set an on-disk inode's unlinked pointer and return the old value. */
+STATIC void
+xfs_iunlink_update_dinode(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*ibp,
+	struct xfs_dinode	*dip,
+	struct xfs_imap		*imap,
+	xfs_ino_t		ino,
+	xfs_agino_t		new_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			offset;
+
+	ASSERT(new_agino == NULLAGINO ||
+	       xfs_verify_agino(mp, XFS_INO_TO_AGNO(mp, ino), new_agino));
+
+	trace_xfs_iunlink_update_dinode(mp,
+			XFS_INO_TO_AGNO(mp, ino),
+			XFS_INO_TO_AGINO(mp, ino),
+			be32_to_cpu(dip->di_next_unlinked), new_agino);
+
+	dip->di_next_unlinked = cpu_to_be32(new_agino);
+	offset = imap->im_boffset +
+			offsetof(struct xfs_dinode, di_next_unlinked);
+
+	/* need to recalc the inode CRC if appropriate */
+	xfs_dinode_calc_crc(mp, dip);
+	xfs_trans_inode_buf(tp, ibp);
+	xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1));
+	xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_agino_t		new_agino,
+	xfs_agino_t		*old_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*ibp;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agino_t		old_value;
+	int			error;
+
+	ASSERT(new_agino == NULLAGINO || xfs_verify_agino(mp, agno, new_agino));
+
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+	if (error)
+		return error;
+
+	/* Make sure the old pointer isn't garbage. */
+	old_value = be32_to_cpu(dip->di_next_unlinked);
+	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+	*old_agino = old_value;
+
+	/*
+	 * Since we're updating a linked list, we should never find that the
+	 * current pointer is the same as the new value, unless we're
+	 * terminating the list.
+	 */
+	*old_agino = old_value;
+	if (old_value == new_agino) {
+		if (new_agino != NULLAGINO)
+			error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* Ok, update the new pointer. */
+	xfs_iunlink_update_dinode(tp, ibp, dip, &ip->i_imap, ip->i_ino,
+			new_agino);
+	return 0;
+out:
+	xfs_trans_brelse(tp, ibp);
+	return error;
+}
+
 /*
  * This is called when the inode's link count goes to 0 or we are creating a
  * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
@@ -1937,15 +2019,12 @@ xfs_iunlink(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_agi		*agi;
-	struct xfs_dinode	*dip;
 	struct xfs_buf		*agibp;
-	struct xfs_buf		*ibp;
 	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
 	xfs_agino_t		next_agino;
 	xfs_agino_t		agino;
 	short			bucket_index;
-	int			offset;
 	int			error;
 
 	ASSERT(VFS_I(ip)->i_mode != 0);
@@ -1980,29 +2059,17 @@ xfs_iunlink(
 	}
 
 	if (next_agino != NULLAGINO) {
+		xfs_agino_t	old_agino;
+
 		/*
 		 * There is already another inode in the bucket we need
 		 * to add ourselves to.  Add us at the front of the list.
-		 * Here we put the head pointer into our next pointer,
-		 * and then we fall through to point the head at us.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
+		error = xfs_iunlink_update_inode(tp, ip, next_agino,
+				&old_agino);
 		if (error)
 			goto out_unlock;
-
-		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
-		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
-		offset = ip->i_imap.im_boffset +
-			offsetof(xfs_dinode_t, di_next_unlinked);
-
-		/* need to recalc the inode CRC if appropriate */
-		xfs_dinode_calc_crc(mp, dip);
-
-		xfs_trans_inode_buf(tp, ibp);
-		xfs_trans_log_buf(tp, ibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-		xfs_inobp_check(mp, ibp);
+		ASSERT(old_agino == NULLAGINO);
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2027,9 +2094,7 @@ xfs_iunlink_remove(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_agi		*agi;
-	struct xfs_dinode	*dip;
 	struct xfs_buf		*agibp;
-	struct xfs_buf		*ibp;
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
 	struct xfs_perag	*pag;
@@ -2038,8 +2103,6 @@ xfs_iunlink_remove(
 	xfs_agino_t		agino;
 	xfs_agino_t		next_agino;
 	short			bucket_index;
-	int			offset;
-	int			last_offset = 0;
 	int			error;
 
 	if (!xfs_verify_ino(mp, ip->i_ino))
@@ -2076,34 +2139,11 @@ xfs_iunlink_remove(
 		/*
 		 * We're at the head of the list.  Get the inode's on-disk
 		 * buffer to see if there is anyone after us on the list.
-		 * Only modify our next pointer if it is not already NULLAGINO.
-		 * This saves us the overhead of dealing with the buffer when
-		 * there is no need to change it.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
-				__func__, error);
+		error = xfs_iunlink_update_inode(tp, ip, NULLAGINO,
+				&next_agino);
+		if (error)
 			goto out_unlock;
-		}
-		next_agino = be32_to_cpu(dip->di_next_unlinked);
-		ASSERT(next_agino != 0);
-		if (next_agino != NULLAGINO) {
-			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			offset = ip->i_imap.im_boffset +
-				offsetof(xfs_dinode_t, di_next_unlinked);
-
-			/* need to recalc the inode CRC if appropriate */
-			xfs_dinode_calc_crc(mp, dip);
-
-			xfs_trans_inode_buf(tp, ibp);
-			xfs_trans_log_buf(tp, ibp, offset,
-					  (offset + sizeof(xfs_agino_t) - 1));
-			xfs_inobp_check(mp, ibp);
-		} else {
-			xfs_trans_brelse(tp, ibp);
-		}
 
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
@@ -2111,13 +2151,13 @@ xfs_iunlink_remove(
 		if (error)
 			goto out_unlock;
 	} else {
+		struct xfs_imap	imap;
+
 		/*
 		 * We need to search the list for the inode being freed.
 		 */
 		last_ibp = NULL;
 		while (next_agino != agino) {
-			struct xfs_imap	imap;
-
 			if (last_ibp)
 				xfs_trans_brelse(tp, last_ibp);
 
@@ -2141,7 +2181,6 @@ xfs_iunlink_remove(
 				goto out_unlock;
 			}
 
-			last_offset = imap.im_boffset;
 			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
 			if (!xfs_verify_agino(mp, agno, next_agino)) {
 				XFS_CORRUPTION_ERROR(__func__,
@@ -2156,45 +2195,14 @@ xfs_iunlink_remove(
 		 * Now last_ibp points to the buffer previous to us on the
 		 * unlinked list.  Pull us from the list.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
-				__func__, error);
+		error = xfs_iunlink_update_inode(tp, ip, NULLAGINO,
+				&next_agino);
+		if (error)
 			goto out_unlock;
-		}
-		next_agino = be32_to_cpu(dip->di_next_unlinked);
-		ASSERT(next_agino != 0);
-		ASSERT(next_agino != agino);
-		if (next_agino != NULLAGINO) {
-			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			offset = ip->i_imap.im_boffset +
-				offsetof(xfs_dinode_t, di_next_unlinked);
-
-			/* need to recalc the inode CRC if appropriate */
-			xfs_dinode_calc_crc(mp, dip);
-
-			xfs_trans_inode_buf(tp, ibp);
-			xfs_trans_log_buf(tp, ibp, offset,
-					  (offset + sizeof(xfs_agino_t) - 1));
-			xfs_inobp_check(mp, ibp);
-		} else {
-			xfs_trans_brelse(tp, ibp);
-		}
-		/*
-		 * Point the previous inode on the list to the next inode.
-		 */
-		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
-		ASSERT(next_agino != 0);
-		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
-		/* need to recalc the inode CRC if appropriate */
-		xfs_dinode_calc_crc(mp, last_dip);
 
-		xfs_trans_inode_buf(tp, last_ibp);
-		xfs_trans_log_buf(tp, last_ibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-		xfs_inobp_check(mp, last_ibp);
+		/* Point the previous inode on the list to the next inode. */
+		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
+				next_ino, next_agino);
 	}
 	pag->pagi_unlinked_count--;
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c10478e7e49a..fbec8f0e1a9a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3397,6 +3397,32 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
 		  __entry->new_ptr)
 );
 
+TRACE_EVENT(xfs_iunlink_update_dinode,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+	TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(xfs_agino_t, old_ptr)
+		__field(xfs_agino_t, new_ptr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->old_ptr = old_ptr;
+		__entry->new_ptr = new_ptr;
+	),
+	TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agino,
+		  __entry->old_ptr,
+		  __entry->new_ptr)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
                   ` (4 preceding siblings ...)
  2019-01-31 23:18 ` [PATCH 5/8] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01 19:01   ` Brian Foster
                     ` (2 more replies)
  2019-01-31 23:18 ` [PATCH 7/8] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
                   ` (2 subsequent siblings)
  8 siblings, 3 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

There's a loop that searches an unlinked bucket list to find the inode
that points to a given inode.  Hoist this into a separate function;
later we'll use our iunlink backref cache to bypass the slow list
operation.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |  136 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 99 insertions(+), 37 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ea38b66fbc59..d5b3f8fdac7e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2084,6 +2084,100 @@ xfs_iunlink(
 	return error;
 }
 
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+	struct xfs_trans	*tp,
+	xfs_ino_t		next_ino,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			error;
+
+	imap->im_blkno = 0;
+	error = xfs_imap(mp, tp, next_ino, imap, 0);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_imap returned error %d.",
+				__func__, error);
+		return error;
+	}
+
+	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+				__func__, error);
+		return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_ino.  Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @ino, @imap, @dipp, and @bpp.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	xfs_agino_t		head_agino,
+	xfs_agino_t		target_agino,
+	xfs_ino_t		*ino,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_imap		last_imap;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*last_ibp = NULL;
+	struct xfs_dinode	*last_dip;
+	xfs_ino_t		next_ino = NULLFSINO;
+	xfs_agino_t		next_agino;
+	int			error;
+
+	ASSERT(head_agino != target_agino);
+
+	next_agino = head_agino;
+	while (next_agino != target_agino) {
+		xfs_agino_t	unlinked_agino;
+
+		if (last_ibp)
+			xfs_trans_brelse(tp, last_ibp);
+
+		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
+				&last_dip, &last_ibp);
+		if (error)
+			return error;
+
+		unlinked_agino = be32_to_cpu(last_dip->di_next_unlinked);
+		if (!xfs_verify_agino(mp, pag->pag_agno, next_agino) ||
+		    next_agino == unlinked_agino) {
+			XFS_CORRUPTION_ERROR(__func__,
+					XFS_ERRLEVEL_LOW, mp,
+					last_dip, sizeof(*last_dip));
+			error = -EFSCORRUPTED;
+			return error;
+		}
+		next_agino = unlinked_agino;
+	}
+
+	/* Should never happen, but don't return garbage. */
+	if (next_ino == NULLFSINO)
+		return -EFSCORRUPTED;
+
+	*ino = next_ino;
+	memcpy(imap, &last_imap, sizeof(last_imap));
+	*dipp = last_dip;
+	*bpp = last_ibp;
+	return 0;
+}
+
 /*
  * Pull the on-disk inode from the AGI unlinked list.
  */
@@ -2153,43 +2247,11 @@ xfs_iunlink_remove(
 	} else {
 		struct xfs_imap	imap;
 
-		/*
-		 * We need to search the list for the inode being freed.
-		 */
-		last_ibp = NULL;
-		while (next_agino != agino) {
-			if (last_ibp)
-				xfs_trans_brelse(tp, last_ibp);
-
-			imap.im_blkno = 0;
-			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
-
-			error = xfs_imap(mp, tp, next_ino, &imap, 0);
-			if (error) {
-				xfs_warn(mp,
-	"%s: xfs_imap returned error %d.",
-					 __func__, error);
-				goto out_unlock;
-			}
-
-			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
-					       &last_ibp, 0, 0);
-			if (error) {
-				xfs_warn(mp,
-	"%s: xfs_imap_to_bp returned error %d.",
-					__func__, error);
-				goto out_unlock;
-			}
-
-			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
-			if (!xfs_verify_agino(mp, agno, next_agino)) {
-				XFS_CORRUPTION_ERROR(__func__,
-						XFS_ERRLEVEL_LOW, mp,
-						last_dip, sizeof(*last_dip));
-				error = -EFSCORRUPTED;
-				goto out_unlock;
-			}
-		}
+		/* We need to search the list for the inode being freed. */
+		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
+				&next_ino, &imap, &last_dip, &last_ibp);
+		if (error)
+			goto out_unlock;
 
 		/*
 		 * Now last_ibp points to the buffer previous to us on the

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 7/8] xfs: add tracepoints for high level iunlink operations
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
                   ` (5 preceding siblings ...)
  2019-01-31 23:18 ` [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01 19:01   ` Brian Foster
  2019-01-31 23:18 ` [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
  2019-02-01  7:57 ` [PATCH 0/8] xfs: incore unlinked list Christoph Hellwig
  8 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add tracepoints so we can associate high level operations with low level
updates.  No functional changes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |    4 ++++
 fs/xfs/xfs_trace.h |    3 +++
 2 files changed, 7 insertions(+)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d5b3f8fdac7e..56349497d75b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2030,6 +2030,8 @@ xfs_iunlink(
 	ASSERT(VFS_I(ip)->i_mode != 0);
 	ASSERT(xfs_verify_ino(mp, ip->i_ino));
 
+	trace_xfs_iunlink(ip);
+
 	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
@@ -2199,6 +2201,8 @@ xfs_iunlink_remove(
 	short			bucket_index;
 	int			error;
 
+	trace_xfs_iunlink_remove(ip);
+
 	if (!xfs_verify_ino(mp, ip->i_ino))
 		return -EFSCORRUPTED;
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fbec8f0e1a9a..22d4729143b5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3423,6 +3423,9 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
 		  __entry->new_ptr)
 );
 
+DEFINE_INODE_EVENT(xfs_iunlink);
+DEFINE_INODE_EVENT(xfs_iunlink_remove);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
                   ` (6 preceding siblings ...)
  2019-01-31 23:18 ` [PATCH 7/8] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
@ 2019-01-31 23:18 ` Darrick J. Wong
  2019-02-01  8:03   ` Christoph Hellwig
                     ` (2 more replies)
  2019-02-01  7:57 ` [PATCH 0/8] xfs: incore unlinked list Christoph Hellwig
  8 siblings, 3 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-01-31 23:18 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Use a rhashtable to cache the unlinked list incore.  This should speed
up unlinked processing considerably when there are a lot of inodes on
the unlinked list because iunlink_remove no longer has to traverse an
entire bucket list to find which inode points to the one being removed.

The incore list structure records "X.next_unlinked = Y" relations, with
the rhashtable using Y to index the records.  This makes finding the
inode X that points to a inode Y very quick.  If our cache fails to find
anything we can always fall back on the old method.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c       |  216 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h       |   10 ++
 fs/xfs/xfs_log_recover.c |   12 ++-
 fs/xfs/xfs_mount.c       |    7 +
 fs/xfs/xfs_mount.h       |    1 
 5 files changed, 245 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 56349497d75b..eec3c6109fc6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1880,6 +1880,189 @@ xfs_inactive(
 	xfs_qm_dqdetach(ip);
 }
 
+/*
+ * Faster In-Core Unlinked List Lookups
+ * ====================================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+	struct rhash_head	iu_rhash_head;
+	xfs_agino_t		iu_agino;
+	xfs_agino_t		iu_next_unlinked;
+};
+
+struct xfs_iunlink_key {
+	xfs_agino_t		iu_next_unlinked;
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+_xfs_iunlink_obj_cmp(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const struct xfs_iunlink_key	*key = arg->key;
+	const struct xfs_iunlink	*iu = obj;
+
+	if (iu->iu_next_unlinked != key->iu_next_unlinked)
+		return 1;
+	return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
+	.nelem_hint		= 512,
+	.key_len		= sizeof(xfs_agino_t),
+	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
+	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= _xfs_iunlink_obj_cmp,
+};
+
+/*
+ * Return X (in @prev_agino), where X.next_unlinked == @agino.  Returns -ENOENT
+ * if no such relation is found.
+ */
+int
+xfs_iunlink_lookup_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		*prev_agino)
+{
+	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
+	struct xfs_iunlink	*iu;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+	*prev_agino = iu->iu_agino;
+	return 0;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+int
+xfs_iunlink_add_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		this_agino)
+{
+	struct xfs_iunlink	*iu;
+
+	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+	iu->iu_agino = prev_agino;
+	iu->iu_next_unlinked = this_agino;
+
+	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+}
+
+/* Forget that X.next_unlinked = @agino. */
+int
+xfs_iunlink_forget_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino)
+{
+	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	kmem_free(iu);
+	return 0;
+}
+
+
+/* Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. */
+int
+xfs_iunlink_change_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		next_unlinked)
+{
+	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	iu->iu_next_unlinked = next_unlinked;
+
+	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+	struct xfs_perag	*pag)
+{
+	return rhashtable_init(&pag->pagi_unlinked_hash,
+			&xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+	void			*ptr,
+	void			*arg)
+{
+	struct xfs_iunlink	*iu = ptr;
+	bool			*freed_anything = arg;
+
+	*freed_anything = true;
+	kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+	struct xfs_perag	*pag)
+{
+	bool			freed_anything = false;
+
+	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+			xfs_iunlink_free_item, &freed_anything);
+
+	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
 /*
  * Point the AGI unlinked bucket at an inode and log the results.  The caller
  * is responsible for validating the old value.
@@ -2072,6 +2255,9 @@ xfs_iunlink(
 		if (error)
 			goto out_unlock;
 		ASSERT(old_agino == NULLAGINO);
+		error = xfs_iunlink_add_backref(pag, agino, next_agino);
+		if (error)
+			goto out_unlock;
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2144,6 +2330,17 @@ xfs_iunlink_map_prev(
 
 	ASSERT(head_agino != target_agino);
 
+	/* See if our backref cache can find it faster. */
+	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
+	if (error == 0) {
+		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
+				&last_dip, &last_ibp);
+		if (error)
+			return error;
+		goto out;
+	}
+
 	next_agino = head_agino;
 	while (next_agino != target_agino) {
 		xfs_agino_t	unlinked_agino;
@@ -2169,6 +2366,7 @@ xfs_iunlink_map_prev(
 		next_agino = unlinked_agino;
 	}
 
+out:
 	/* Should never happen, but don't return garbage. */
 	if (next_ino == NULLFSINO)
 		return -EFSCORRUPTED;
@@ -2243,6 +2441,12 @@ xfs_iunlink_remove(
 		if (error)
 			goto out_unlock;
 
+		if (next_agino != NULLAGINO) {
+			error = xfs_iunlink_forget_backref(pag, next_agino);
+			if (error)
+				goto out_unlock;
+		}
+
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
 				next_agino);
@@ -2265,10 +2469,22 @@ xfs_iunlink_remove(
 				&next_agino);
 		if (error)
 			goto out_unlock;
+		if (next_agino != NULLAGINO) {
+			error = xfs_iunlink_forget_backref(pag, next_agino);
+			if (error)
+				goto out_unlock;
+		}
 
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
 				next_ino, next_agino);
+		if (next_agino == NULLAGINO)
+			error = xfs_iunlink_forget_backref(pag, agino);
+		else
+			error = xfs_iunlink_change_backref(pag, agino,
+					next_agino);
+		if (error)
+			goto out_unlock;
 	}
 	pag->pagi_unlinked_count--;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..9690f0d32e33 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,14 @@ extern struct kmem_zone	*xfs_inode_zone;
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
+		xfs_agino_t *prev_agino);
+int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
+		xfs_agino_t this_agino);
+int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
+int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
+		xfs_agino_t this_agino);
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c920b8aeba01..909b70550aa8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5078,12 +5078,22 @@ xlog_recover_process_one_iunlink(
 	agino = be32_to_cpu(dip->di_next_unlinked);
 	xfs_buf_relse(ibp);
 
-	/* Make sure the in-core data knows about this unlinked inode. */
+	/*
+	 * Make sure the in-core data knows about this unlinked inode.  Since
+	 * our iunlinks recovery basically just deletes the head of a bucket
+	 * list until the bucket is empty, we need only to add the backref from
+	 * the current list item to the next one, if this isn't the list tail.
+	 */
 	pag = xfs_perag_get(mp, agno);
 	mutex_lock(&pag->pagi_unlinked_lock);
 	pag->pagi_unlinked_count++;
+	if (agino != NULLAGINO)
+		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
+				agino);
 	mutex_unlock(&pag->pagi_unlinked_lock);
 	xfs_perag_put(pag);
+	if (error)
+		goto fail_iput;
 
 	/*
 	 * Prevent any DMAPI event from being sent when the reference on
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6bfc985669e0..4eb97ddc915e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -151,6 +151,7 @@ xfs_free_perag(
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		ASSERT(pag->pagi_unlinked_count == 0 ||
 		       XFS_FORCED_SHUTDOWN(mp));
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pagi_unlinked_lock);
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
@@ -231,6 +232,9 @@ xfs_initialize_perag(
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
 		mutex_init(&pag->pagi_unlinked_lock);
+		error = xfs_iunlink_init(pag);
+		if (error)
+			goto out_iunlink_mutex;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -241,6 +245,8 @@ xfs_initialize_perag(
 	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
 	return 0;
 
+out_iunlink_mutex:
+	mutex_destroy(&pag->pagi_unlinked_lock);
 out_hash_destroy:
 	xfs_buf_hash_destroy(pag);
 out_free_pag:
@@ -253,6 +259,7 @@ xfs_initialize_perag(
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pagi_unlinked_lock);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0fcc6b6a4f67..27a433e93d32 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -392,6 +392,7 @@ typedef struct xfs_perag {
 	/* unlinked inodes */
 	struct mutex		pagi_unlinked_lock;
 	uint32_t		pagi_unlinked_count;
+	struct rhashtable	pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 0/8] xfs: incore unlinked list
  2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
                   ` (7 preceding siblings ...)
  2019-01-31 23:18 ` [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
@ 2019-02-01  7:57 ` Christoph Hellwig
  8 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-01  7:57 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:17:50PM -0800, Darrick J. Wong wrote:
> If you're going to start using this mess, you probably ought to just
> pull from my git trees: kernel[1], xfsprogs[2], and xfstests[3].

Why would we need updated xfsprogs for this?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] xfs: clean up iunlink functions
  2019-01-31 23:17 ` [PATCH 1/8] xfs: clean up iunlink functions Darrick J. Wong
@ 2019-02-01  8:01   ` Christoph Hellwig
  2019-02-02 19:15     ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-01  8:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:17:56PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Fix some indentation issues with the iunlink functions and reorganize
> the tops of the functions to be identical.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c |   77 +++++++++++++++++++++++++++-------------------------
>  1 file changed, 40 insertions(+), 37 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index c8bf02be0003..d18354517320 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1892,26 +1892,32 @@ xfs_inactive(
>   */
>  STATIC int
>  xfs_iunlink(
> -	struct xfs_trans *tp,
> -	struct xfs_inode *ip)
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip)
>  {
> -	xfs_mount_t	*mp = tp->t_mountp;
> -	xfs_agi_t	*agi;
> -	xfs_dinode_t	*dip;
> -	xfs_buf_t	*agibp;
> -	xfs_buf_t	*ibp;
> -	xfs_agino_t	agino;
> -	short		bucket_index;
> -	int		offset;
> -	int		error;
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_agi		*agi;
> +	struct xfs_dinode	*dip;
> +	struct xfs_buf		*agibp;
> +	struct xfs_buf		*ibp;
> +	xfs_agnumber_t		agno;
> +	xfs_agino_t		agino;
> +	short			bucket_index;
> +	int			offset;
> +	int			error;
>  
>  	ASSERT(VFS_I(ip)->i_mode != 0);
> +	ASSERT(xfs_verify_ino(mp, ip->i_ino));

This verify_ino calls seems odd given that we pass in an initialized
inode struct..

> +
> +	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);

And then we could move these initializations up int othe lines with
the variable declarations.

> +	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
>  
>  	/*
>  	 * Get the agi buffer first.  It ensures lock ordering
>  	 * on the list.
>  	 */

If you are touching this anyway, the whole comment could be changed
into a single line..

> +	if (!xfs_verify_ino(mp, ip->i_ino))
> +		return -EFSCORRUPTED;
>  
> -	mp = tp->t_mountp;
>  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);

Same here.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-01-31 23:18 ` [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
@ 2019-02-01  8:03   ` Christoph Hellwig
  2019-02-01 23:59     ` Dave Chinner
  2019-02-01 19:29   ` Brian Foster
  2019-02-02 17:01   ` Christoph Hellwig
  2 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-01  8:03 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Use a rhashtable to cache the unlinked list incore.  This should speed
> up unlinked processing considerably when there are a lot of inodes on
> the unlinked list because iunlink_remove no longer has to traverse an
> entire bucket list to find which inode points to the one being removed.

This sounds pretty reasonable and a real review will follow.  But can
you quantify the considerably speedups for real life workloads?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/8] xfs: track unlinked inode counts in per-ag data
  2019-01-31 23:18 ` [PATCH 2/8] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
@ 2019-02-01 18:59   ` Brian Foster
  2019-02-01 19:33     ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-01 18:59 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:03PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Track the number of unlinked inodes in each AG so that we can use these
> decisions to throttle inactivations when the unlinked list gets long.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c       |   40 +++++++++++++++++++++++++++++-----------
>  fs/xfs/xfs_log_recover.c |    8 ++++++++
>  fs/xfs/xfs_mount.c       |    5 +++++
>  fs/xfs/xfs_mount.h       |    4 ++++
>  4 files changed, 46 insertions(+), 11 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index d18354517320..98355f5f9253 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1900,6 +1900,7 @@ xfs_iunlink(
>  	struct xfs_dinode	*dip;
>  	struct xfs_buf		*agibp;
>  	struct xfs_buf		*ibp;
> +	struct xfs_perag	*pag;
>  	xfs_agnumber_t		agno;
>  	xfs_agino_t		agino;
>  	short			bucket_index;
> @@ -1912,6 +1913,8 @@ xfs_iunlink(
>  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> +	pag = xfs_perag_get(mp, agno);
> +	mutex_lock(&pag->pagi_unlinked_lock);

Any particular reason for using a mutex over a spinlock or atomic_t?

Brian

>  
>  	/*
>  	 * Get the agi buffer first.  It ensures lock ordering
> @@ -1919,7 +1922,7 @@ xfs_iunlink(
>  	 */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
>  	if (error)
> -		return error;
> +		goto out_unlock;
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
> @@ -1939,7 +1942,7 @@ xfs_iunlink(
>  		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
>  				       0, 0);
>  		if (error)
> -			return error;
> +			goto out_unlock;
>  
>  		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
>  		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> @@ -1964,7 +1967,12 @@ xfs_iunlink(
>  		(sizeof(xfs_agino_t) * bucket_index);
>  	xfs_trans_log_buf(tp, agibp, offset,
>  			  (offset + sizeof(xfs_agino_t) - 1));
> -	return 0;
> +	pag->pagi_unlinked_count++;
> +
> +out_unlock:
> +	mutex_unlock(&pag->pagi_unlinked_lock);
> +	xfs_perag_put(pag);
> +	return error;
>  }
>  
>  /*
> @@ -1982,6 +1990,7 @@ xfs_iunlink_remove(
>  	struct xfs_buf		*ibp;
>  	struct xfs_buf		*last_ibp;
>  	struct xfs_dinode	*last_dip = NULL;
> +	struct xfs_perag	*pag;
>  	xfs_ino_t		next_ino;
>  	xfs_agnumber_t		agno;
>  	xfs_agino_t		agino;
> @@ -1997,6 +2006,8 @@ xfs_iunlink_remove(
>  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> +	pag = xfs_perag_get(mp, agno);
> +	mutex_lock(&pag->pagi_unlinked_lock);
>  
>  	/*
>  	 * Get the agi buffer first.  It ensures lock ordering
> @@ -2004,7 +2015,7 @@ xfs_iunlink_remove(
>  	 */
>  	error = xfs_read_agi(mp, tp, agno, &agibp);
>  	if (error)
> -		return error;
> +		goto out_unlock;
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
> @@ -2015,7 +2026,8 @@ xfs_iunlink_remove(
>  			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
>  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
>  				agi, sizeof(*agi));
> -		return -EFSCORRUPTED;
> +		error = -EFSCORRUPTED;
> +		goto out_unlock;
>  	}
>  
>  	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
> @@ -2031,7 +2043,7 @@ xfs_iunlink_remove(
>  		if (error) {
>  			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
>  				__func__, error);
> -			return error;
> +			goto out_unlock;
>  		}
>  		next_agino = be32_to_cpu(dip->di_next_unlinked);
>  		ASSERT(next_agino != 0);
> @@ -2080,7 +2092,7 @@ xfs_iunlink_remove(
>  				xfs_warn(mp,
>  	"%s: xfs_imap returned error %d.",
>  					 __func__, error);
> -				return error;
> +				goto out_unlock;
>  			}
>  
>  			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> @@ -2089,7 +2101,7 @@ xfs_iunlink_remove(
>  				xfs_warn(mp,
>  	"%s: xfs_imap_to_bp returned error %d.",
>  					__func__, error);
> -				return error;
> +				goto out_unlock;
>  			}
>  
>  			last_offset = imap.im_boffset;
> @@ -2098,7 +2110,8 @@ xfs_iunlink_remove(
>  				XFS_CORRUPTION_ERROR(__func__,
>  						XFS_ERRLEVEL_LOW, mp,
>  						last_dip, sizeof(*last_dip));
> -				return -EFSCORRUPTED;
> +				error = -EFSCORRUPTED;
> +				goto out_unlock;
>  			}
>  		}
>  
> @@ -2111,7 +2124,7 @@ xfs_iunlink_remove(
>  		if (error) {
>  			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
>  				__func__, error);
> -			return error;
> +			goto out_unlock;
>  		}
>  		next_agino = be32_to_cpu(dip->di_next_unlinked);
>  		ASSERT(next_agino != 0);
> @@ -2146,7 +2159,12 @@ xfs_iunlink_remove(
>  				  (offset + sizeof(xfs_agino_t) - 1));
>  		xfs_inobp_check(mp, last_ibp);
>  	}
> -	return 0;
> +	pag->pagi_unlinked_count--;
> +
> +out_unlock:
> +	mutex_unlock(&pag->pagi_unlinked_lock);
> +	xfs_perag_put(pag);
> +	return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index ff9a27834c50..c920b8aeba01 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -5054,6 +5054,7 @@ xlog_recover_process_one_iunlink(
>  	struct xfs_buf			*ibp;
>  	struct xfs_dinode		*dip;
>  	struct xfs_inode		*ip;
> +	struct xfs_perag		*pag;
>  	xfs_ino_t			ino;
>  	int				error;
>  
> @@ -5077,6 +5078,13 @@ xlog_recover_process_one_iunlink(
>  	agino = be32_to_cpu(dip->di_next_unlinked);
>  	xfs_buf_relse(ibp);
>  
> +	/* Make sure the in-core data knows about this unlinked inode. */
> +	pag = xfs_perag_get(mp, agno);
> +	mutex_lock(&pag->pagi_unlinked_lock);
> +	pag->pagi_unlinked_count++;
> +	mutex_unlock(&pag->pagi_unlinked_lock);
> +	xfs_perag_put(pag);
> +
>  	/*
>  	 * Prevent any DMAPI event from being sent when the reference on
>  	 * the inode is dropped.
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 10be706ec72e..6bfc985669e0 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -149,6 +149,9 @@ xfs_free_perag(
>  		spin_unlock(&mp->m_perag_lock);
>  		ASSERT(pag);
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> +		ASSERT(pag->pagi_unlinked_count == 0 ||
> +		       XFS_FORCED_SHUTDOWN(mp));
> +		mutex_destroy(&pag->pagi_unlinked_lock);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> @@ -227,6 +230,7 @@ xfs_initialize_perag(
>  		/* first new pag is fully initialized */
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
> +		mutex_init(&pag->pagi_unlinked_lock);
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> @@ -249,6 +253,7 @@ xfs_initialize_perag(
>  		if (!pag)
>  			break;
>  		xfs_buf_hash_destroy(pag);
> +		mutex_destroy(&pag->pagi_unlinked_lock);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		kmem_free(pag);
>  	}
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index e344b1dfde63..0fcc6b6a4f67 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -388,6 +388,10 @@ typedef struct xfs_perag {
>  
>  	/* reference count */
>  	uint8_t			pagf_refcount_level;
> +
> +	/* unlinked inodes */
> +	struct mutex		pagi_unlinked_lock;
> +	uint32_t		pagi_unlinked_count;
>  } xfs_perag_t;
>  
>  static inline struct xfs_ag_resv *
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/8] xfs: refactor AGI unlinked bucket updates
  2019-01-31 23:18 ` [PATCH 3/8] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
@ 2019-02-01 19:00   ` Brian Foster
  2019-02-02 19:50     ` Darrick J. Wong
  2019-02-02 16:21   ` Christoph Hellwig
  1 sibling, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-01 19:00 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:09PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Split the AGI unlinked bucket updates into a separate function.  No
> functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c |   69 ++++++++++++++++++++++++++++++++++++++--------------
>  fs/xfs/xfs_trace.h |   26 ++++++++++++++++++++
>  2 files changed, 76 insertions(+), 19 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 98355f5f9253..3c33136b21ef 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1880,6 +1880,46 @@ xfs_inactive(
>  	xfs_qm_dqdetach(ip);
>  }
>  
> +/*
> + * Point the AGI unlinked bucket at an inode and log the results.  The caller
> + * is responsible for validating the old value.
> + */
> +STATIC int
> +xfs_iunlink_update_bucket(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*agibp,
> +	unsigned int		bucket_index,
> +	xfs_agino_t		new_agino)
> +{
> +	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
> +	xfs_agino_t		old_value;
> +	int			offset;
> +
> +	ASSERT(new_agino == NULLAGINO ||
> +	       xfs_verify_agino(tp->t_mountp, be32_to_cpu(agi->agi_seqno),
> +				new_agino));
> +
> +	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> +	trace_xfs_iunlink_update_bucket(tp->t_mountp,
> +			be32_to_cpu(agi->agi_seqno), bucket_index, old_value,
> +			new_agino);
> +

Might as well pass agno as a param, particularly since the callers
already have it.

> +	/*
> +	 * We should never find the head of the list already set to the value
> +	 * passed in because either we're adding or removing ourselves from the
> +	 * head of the list.
> +	 */
> +	if (old_value == new_agino)
> +		return -EFSCORRUPTED;
> +

This seems a little weird in the NULLAGINO case. That probably should
still never happen, but doesn't seem indicative of corruption on its
own. *shrug* not a big deal.

Brian

> +	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
> +	offset = offsetof(struct xfs_agi, agi_unlinked) +
> +			(sizeof(xfs_agino_t) * bucket_index);
> +	xfs_trans_log_buf(tp, agibp, offset,
> +			(offset + sizeof(xfs_agino_t) - 1));
> +	return 0;
> +}
> +
>  /*
>   * This is called when the inode's link count goes to 0 or we are creating a
>   * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
> @@ -1958,15 +1998,10 @@ xfs_iunlink(
>  		xfs_inobp_check(mp, ibp);
>  	}
>  
> -	/*
> -	 * Point the bucket head pointer at the inode being inserted.
> -	 */
> -	ASSERT(agino != 0);
> -	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
> -	offset = offsetof(xfs_agi_t, agi_unlinked) +
> -		(sizeof(xfs_agino_t) * bucket_index);
> -	xfs_trans_log_buf(tp, agibp, offset,
> -			  (offset + sizeof(xfs_agino_t) - 1));
> +	/* Point the head of the list to point to this inode. */
> +	error = xfs_iunlink_update_bucket(tp, agibp, bucket_index, agino);
> +	if (error)
> +		goto out_unlock;
>  	pag->pagi_unlinked_count++;
>  
>  out_unlock:
> @@ -2062,16 +2097,12 @@ xfs_iunlink_remove(
>  		} else {
>  			xfs_trans_brelse(tp, ibp);
>  		}
> -		/*
> -		 * Point the bucket head pointer at the next inode.
> -		 */
> -		ASSERT(next_agino != 0);
> -		ASSERT(next_agino != agino);
> -		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
> -		offset = offsetof(xfs_agi_t, agi_unlinked) +
> -			(sizeof(xfs_agino_t) * bucket_index);
> -		xfs_trans_log_buf(tp, agibp, offset,
> -				  (offset + sizeof(xfs_agino_t) - 1));
> +
> +		/* Point the head of the list to the next unlinked inode. */
> +		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
> +				next_agino);
> +		if (error)
> +			goto out_unlock;
>  	} else {
>  		/*
>  		 * We need to search the list for the inode being freed.
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 6fcc893dfc91..c10478e7e49a 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3371,6 +3371,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
>  DEFINE_TRANS_EVENT(xfs_trans_add_item);
>  DEFINE_TRANS_EVENT(xfs_trans_free_items);
>  
> +TRACE_EVENT(xfs_iunlink_update_bucket,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
> +		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
> +	TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(unsigned int, bucket)
> +		__field(xfs_agino_t, old_ptr)
> +		__field(xfs_agino_t, new_ptr)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->bucket = bucket;
> +		__entry->old_ptr = old_ptr;
> +		__entry->new_ptr = new_ptr;
> +	),
> +	TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno,
> +		  __entry->bucket,
> +		  __entry->old_ptr,
> +		  __entry->new_ptr)
> +);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks
  2019-01-31 23:18 ` [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
@ 2019-02-01 19:00   ` Brian Foster
  2019-02-02 16:22   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2019-02-01 19:00 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:15PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Strengthen our checking of the AGI unlinked pointers when we start to
> use them for updating the metadata.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_inode.c |   28 +++++++++++++++++-----------
>  1 file changed, 17 insertions(+), 11 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 3c33136b21ef..4ddda3f3255f 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1942,6 +1942,7 @@ xfs_iunlink(
>  	struct xfs_buf		*ibp;
>  	struct xfs_perag	*pag;
>  	xfs_agnumber_t		agno;
> +	xfs_agino_t		next_agino;
>  	xfs_agino_t		agino;
>  	short			bucket_index;
>  	int			offset;
> @@ -1966,13 +1967,19 @@ xfs_iunlink(
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
> -	 * Get the index into the agi hash table for the
> -	 * list this inode will go on.
> +	 * Get the index into the agi hash table for the list this inode will
> +	 * go on.  Make sure the pointer isn't garbage and that this inode
> +	 * isn't already on the list.
>  	 */
> -	ASSERT(agi->agi_unlinked[bucket_index]);
> -	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
> +	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> +	if (next_agino == agino ||
> +	    (next_agino != NULLAGINO &&
> +	     !xfs_verify_agino(mp, agno, next_agino))) {
> +		error = -EFSCORRUPTED;
> +		goto out_unlock;
> +	}
>  
> -	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
> +	if (next_agino != NULLAGINO) {
>  		/*
>  		 * There is already another inode in the bucket we need
>  		 * to add ourselves to.  Add us at the front of the list.
> @@ -2054,18 +2061,18 @@ xfs_iunlink_remove(
>  	agi = XFS_BUF_TO_AGI(agibp);
>  
>  	/*
> -	 * Get the index into the agi hash table for the
> -	 * list this inode will go on.
> +	 * Get the index into the agi hash table for the list this inode will
> +	 * go on.  Make sure the head pointer isn't garbage.
>  	 */
> -	if (!xfs_verify_agino(mp, agno,
> -			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
> +	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> +	if (!xfs_verify_agino(mp, agno, next_agino)) {
>  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
>  				agi, sizeof(*agi));
>  		error = -EFSCORRUPTED;
>  		goto out_unlock;
>  	}
>  
> -	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
> +	if (next_agino == agino) {
>  		/*
>  		 * We're at the head of the list.  Get the inode's on-disk
>  		 * buffer to see if there is anyone after us on the list.
> @@ -2107,7 +2114,6 @@ xfs_iunlink_remove(
>  		/*
>  		 * We need to search the list for the inode being freed.
>  		 */
> -		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
>  		last_ibp = NULL;
>  		while (next_agino != agino) {
>  			struct xfs_imap	imap;
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 5/8] xfs: refactor inode unlinked pointer update functions
  2019-01-31 23:18 ` [PATCH 5/8] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
@ 2019-02-01 19:01   ` Brian Foster
  2019-02-02 22:00     ` Darrick J. Wong
  2019-02-02 16:27   ` Christoph Hellwig
  1 sibling, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-01 19:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:21PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Hoist the functions that update an inode's unlinked pointer updates into
> a helper.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c |  188 +++++++++++++++++++++++++++-------------------------
>  fs/xfs/xfs_trace.h |   26 +++++++
>  2 files changed, 124 insertions(+), 90 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 4ddda3f3255f..ea38b66fbc59 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1920,6 +1920,88 @@ xfs_iunlink_update_bucket(
>  	return 0;
>  }
>  
> +/* Set an on-disk inode's unlinked pointer and return the old value. */

Doesn't look like this one returns anything.

> +STATIC void
> +xfs_iunlink_update_dinode(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*ibp,
> +	struct xfs_dinode	*dip,
> +	struct xfs_imap		*imap,
> +	xfs_ino_t		ino,
> +	xfs_agino_t		new_agino)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	int			offset;
> +
> +	ASSERT(new_agino == NULLAGINO ||
> +	       xfs_verify_agino(mp, XFS_INO_TO_AGNO(mp, ino), new_agino));
> +
> +	trace_xfs_iunlink_update_dinode(mp,
> +			XFS_INO_TO_AGNO(mp, ino),
> +			XFS_INO_TO_AGINO(mp, ino),
> +			be32_to_cpu(dip->di_next_unlinked), new_agino);
> +
> +	dip->di_next_unlinked = cpu_to_be32(new_agino);
> +	offset = imap->im_boffset +
> +			offsetof(struct xfs_dinode, di_next_unlinked);
> +
> +	/* need to recalc the inode CRC if appropriate */
> +	xfs_dinode_calc_crc(mp, dip);
> +	xfs_trans_inode_buf(tp, ibp);
> +	xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1));
> +	xfs_inobp_check(mp, ibp);
> +}
> +
> +/* Set an in-core inode's unlinked pointer and return the old value. */
> +STATIC int
> +xfs_iunlink_update_inode(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip,
> +	xfs_agino_t		new_agino,
> +	xfs_agino_t		*old_agino)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_dinode	*dip;
> +	struct xfs_buf		*ibp;
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	xfs_agino_t		old_value;
> +	int			error;
> +
> +	ASSERT(new_agino == NULLAGINO || xfs_verify_agino(mp, agno, new_agino));
> +
> +	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
> +	if (error)
> +		return error;
> +
> +	/* Make sure the old pointer isn't garbage. */
> +	old_value = be32_to_cpu(dip->di_next_unlinked);
> +	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {
> +		error = -EFSCORRUPTED;
> +		goto out;
> +	}
> +	*old_agino = old_value;
> +
> +	/*
> +	 * Since we're updating a linked list, we should never find that the
> +	 * current pointer is the same as the new value, unless we're
> +	 * terminating the list.
> +	 */
> +	*old_agino = old_value;

Double assign of *old_agino.

> +	if (old_value == new_agino) {
> +		if (new_agino != NULLAGINO)
> +			error = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	/* Ok, update the new pointer. */
> +	xfs_iunlink_update_dinode(tp, ibp, dip, &ip->i_imap, ip->i_ino,
> +			new_agino);
> +	return 0;
> +out:
> +	xfs_trans_brelse(tp, ibp);
> +	return error;
> +}
> +
>  /*
>   * This is called when the inode's link count goes to 0 or we are creating a
>   * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
> @@ -1937,15 +2019,12 @@ xfs_iunlink(
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
>  	struct xfs_agi		*agi;
> -	struct xfs_dinode	*dip;
>  	struct xfs_buf		*agibp;
> -	struct xfs_buf		*ibp;
>  	struct xfs_perag	*pag;
>  	xfs_agnumber_t		agno;
>  	xfs_agino_t		next_agino;
>  	xfs_agino_t		agino;
>  	short			bucket_index;
> -	int			offset;
>  	int			error;
>  
>  	ASSERT(VFS_I(ip)->i_mode != 0);
> @@ -1980,29 +2059,17 @@ xfs_iunlink(
>  	}
>  
>  	if (next_agino != NULLAGINO) {
> +		xfs_agino_t	old_agino;
> +
>  		/*
>  		 * There is already another inode in the bucket we need
>  		 * to add ourselves to.  Add us at the front of the list.
> -		 * Here we put the head pointer into our next pointer,
> -		 * and then we fall through to point the head at us.
>  		 */
> -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> -				       0, 0);
> +		error = xfs_iunlink_update_inode(tp, ip, next_agino,
> +				&old_agino);

What's left of the comment above seems a bit misleading wrt to what
we're doing here. Could we say something like "point this inode to the
current head of the list" instead of "add us to the front of the list?"

>  		if (error)
>  			goto out_unlock;
> -
> -		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
> -		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> -		offset = ip->i_imap.im_boffset +
> -			offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -		/* need to recalc the inode CRC if appropriate */
> -		xfs_dinode_calc_crc(mp, dip);
> -
> -		xfs_trans_inode_buf(tp, ibp);
> -		xfs_trans_log_buf(tp, ibp, offset,
> -				  (offset + sizeof(xfs_agino_t) - 1));
> -		xfs_inobp_check(mp, ibp);
> +		ASSERT(old_agino == NULLAGINO);
>  	}
>  
>  	/* Point the head of the list to point to this inode. */
> @@ -2027,9 +2094,7 @@ xfs_iunlink_remove(
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
>  	struct xfs_agi		*agi;
> -	struct xfs_dinode	*dip;
>  	struct xfs_buf		*agibp;
> -	struct xfs_buf		*ibp;
>  	struct xfs_buf		*last_ibp;
>  	struct xfs_dinode	*last_dip = NULL;
>  	struct xfs_perag	*pag;
> @@ -2038,8 +2103,6 @@ xfs_iunlink_remove(
>  	xfs_agino_t		agino;
>  	xfs_agino_t		next_agino;
>  	short			bucket_index;
> -	int			offset;
> -	int			last_offset = 0;
>  	int			error;
>  
>  	if (!xfs_verify_ino(mp, ip->i_ino))
> @@ -2076,34 +2139,11 @@ xfs_iunlink_remove(
>  		/*
>  		 * We're at the head of the list.  Get the inode's on-disk
>  		 * buffer to see if there is anyone after us on the list.
> -		 * Only modify our next pointer if it is not already NULLAGINO.
> -		 * This saves us the overhead of dealing with the buffer when
> -		 * there is no need to change it.
>  		 */
> -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> -				       0, 0);
> -		if (error) {
> -			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> -				__func__, error);
> +		error = xfs_iunlink_update_inode(tp, ip, NULLAGINO,
> +				&next_agino);
> +		if (error)
>  			goto out_unlock;
> -		}
> -		next_agino = be32_to_cpu(dip->di_next_unlinked);
> -		ASSERT(next_agino != 0);
> -		if (next_agino != NULLAGINO) {
> -			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
> -			offset = ip->i_imap.im_boffset +
> -				offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -			/* need to recalc the inode CRC if appropriate */
> -			xfs_dinode_calc_crc(mp, dip);
> -
> -			xfs_trans_inode_buf(tp, ibp);
> -			xfs_trans_log_buf(tp, ibp, offset,
> -					  (offset + sizeof(xfs_agino_t) - 1));
> -			xfs_inobp_check(mp, ibp);
> -		} else {
> -			xfs_trans_brelse(tp, ibp);
> -		}
>  
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
> @@ -2111,13 +2151,13 @@ xfs_iunlink_remove(
>  		if (error)
>  			goto out_unlock;
>  	} else {
> +		struct xfs_imap	imap;
> +
>  		/*
>  		 * We need to search the list for the inode being freed.
>  		 */
>  		last_ibp = NULL;
>  		while (next_agino != agino) {
> -			struct xfs_imap	imap;
> -
>  			if (last_ibp)
>  				xfs_trans_brelse(tp, last_ibp);
>  
> @@ -2141,7 +2181,6 @@ xfs_iunlink_remove(
>  				goto out_unlock;
>  			}
>  
> -			last_offset = imap.im_boffset;
>  			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
>  			if (!xfs_verify_agino(mp, agno, next_agino)) {
>  				XFS_CORRUPTION_ERROR(__func__,
> @@ -2156,45 +2195,14 @@ xfs_iunlink_remove(
>  		 * Now last_ibp points to the buffer previous to us on the
>  		 * unlinked list.  Pull us from the list.
>  		 */
> -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> -				       0, 0);
> -		if (error) {
> -			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
> -				__func__, error);
> +		error = xfs_iunlink_update_inode(tp, ip, NULLAGINO,
> +				&next_agino);
> +		if (error)
>  			goto out_unlock;

It looks like the above _update_inode() call is now common across both
branches. We might be able to factor that out a bit further so the
if/else just determines whether we update the bucket or a previous
dinode.

Brian

> -		}
> -		next_agino = be32_to_cpu(dip->di_next_unlinked);
> -		ASSERT(next_agino != 0);
> -		ASSERT(next_agino != agino);
> -		if (next_agino != NULLAGINO) {
> -			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
> -			offset = ip->i_imap.im_boffset +
> -				offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -			/* need to recalc the inode CRC if appropriate */
> -			xfs_dinode_calc_crc(mp, dip);
> -
> -			xfs_trans_inode_buf(tp, ibp);
> -			xfs_trans_log_buf(tp, ibp, offset,
> -					  (offset + sizeof(xfs_agino_t) - 1));
> -			xfs_inobp_check(mp, ibp);
> -		} else {
> -			xfs_trans_brelse(tp, ibp);
> -		}
> -		/*
> -		 * Point the previous inode on the list to the next inode.
> -		 */
> -		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
> -		ASSERT(next_agino != 0);
> -		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
> -
> -		/* need to recalc the inode CRC if appropriate */
> -		xfs_dinode_calc_crc(mp, last_dip);
>  
> -		xfs_trans_inode_buf(tp, last_ibp);
> -		xfs_trans_log_buf(tp, last_ibp, offset,
> -				  (offset + sizeof(xfs_agino_t) - 1));
> -		xfs_inobp_check(mp, last_ibp);
> +		/* Point the previous inode on the list to the next inode. */
> +		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
> +				next_ino, next_agino);
>  	}
>  	pag->pagi_unlinked_count--;
>  
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index c10478e7e49a..fbec8f0e1a9a 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3397,6 +3397,32 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
>  		  __entry->new_ptr)
>  );
>  
> +TRACE_EVENT(xfs_iunlink_update_dinode,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
> +		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
> +	TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agino_t, agino)
> +		__field(xfs_agino_t, old_ptr)
> +		__field(xfs_agino_t, new_ptr)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->agino = agino;
> +		__entry->old_ptr = old_ptr;
> +		__entry->new_ptr = new_ptr;
> +	),
> +	TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno,
> +		  __entry->agino,
> +		  __entry->old_ptr,
> +		  __entry->new_ptr)
> +);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-01-31 23:18 ` [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function Darrick J. Wong
@ 2019-02-01 19:01   ` Brian Foster
  2019-02-02 20:46     ` Darrick J. Wong
  2019-02-02 16:30   ` Christoph Hellwig
  2019-02-02 16:51   ` Christoph Hellwig
  2 siblings, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-01 19:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:27PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> There's a loop that searches an unlinked bucket list to find the inode
> that points to a given inode.  Hoist this into a separate function;
> later we'll use our iunlink backref cache to bypass the slow list
> operation.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c |  136 ++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 99 insertions(+), 37 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index ea38b66fbc59..d5b3f8fdac7e 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2084,6 +2084,100 @@ xfs_iunlink(
>  	return error;
>  }
>  
> +/* Return the imap, dinode pointer, and buffer for an inode. */
> +STATIC int
> +xfs_iunlink_map_ino(
> +	struct xfs_trans	*tp,
> +	xfs_ino_t		next_ino,

Might be good to clean up some of these variable names as this code gets
factored out. E.g., "next_ino" doesn't mean much more than "ino" in the
context of this helper.

> +	struct xfs_imap		*imap,
> +	struct xfs_dinode	**dipp,
> +	struct xfs_buf		**bpp)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	int			error;
> +
> +	imap->im_blkno = 0;
> +	error = xfs_imap(mp, tp, next_ino, imap, 0);
> +	if (error) {
> +		xfs_warn(mp, "%s: xfs_imap returned error %d.",
> +				__func__, error);
> +		return error;
> +	}
> +
> +	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
> +	if (error) {
> +		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> +				__func__, error);
> +		return error;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Walk the unlinked chain from @head_agino until we find the inode that
> + * points to @target_ino.  Return the inode number, map, dinode pointer,
> + * and inode cluster buffer of that inode as @ino, @imap, @dipp, and @bpp.
> + *
> + * Do not call this function if @target_agino is the head of the list.
> + */
> +STATIC int
> +xfs_iunlink_map_prev(
> +	struct xfs_trans	*tp,
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		head_agino,
> +	xfs_agino_t		target_agino,
> +	xfs_ino_t		*ino,
> +	struct xfs_imap		*imap,
> +	struct xfs_dinode	**dipp,
> +	struct xfs_buf		**bpp)
> +{

It would also be nice to have some oneline comments next to the params
of some of these functions.

> +	struct xfs_imap		last_imap;
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_buf		*last_ibp = NULL;
> +	struct xfs_dinode	*last_dip;
> +	xfs_ino_t		next_ino = NULLFSINO;
> +	xfs_agino_t		next_agino;
> +	int			error;
> +
> +	ASSERT(head_agino != target_agino);
> +
> +	next_agino = head_agino;
> +	while (next_agino != target_agino) {
> +		xfs_agino_t	unlinked_agino;
> +
> +		if (last_ibp)
> +			xfs_trans_brelse(tp, last_ibp);
> +
> +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> +				&last_dip, &last_ibp);
> +		if (error)
> +			return error;
> +
> +		unlinked_agino = be32_to_cpu(last_dip->di_next_unlinked);
> +		if (!xfs_verify_agino(mp, pag->pag_agno, next_agino) ||

Should the next_agino above be unlinked_agino? Also is the == check
below a loop check (comment pls)?

> +		    next_agino == unlinked_agino) {
> +			XFS_CORRUPTION_ERROR(__func__,
> +					XFS_ERRLEVEL_LOW, mp,
> +					last_dip, sizeof(*last_dip));
> +			error = -EFSCORRUPTED;
> +			return error;
> +		}
> +		next_agino = unlinked_agino;
> +	}
> +
> +	/* Should never happen, but don't return garbage. */
> +	if (next_ino == NULLFSINO)
> +		return -EFSCORRUPTED;
> +
> +	*ino = next_ino;
> +	memcpy(imap, &last_imap, sizeof(last_imap));
> +	*dipp = last_dip;
> +	*bpp = last_ibp;
> +	return 0;
> +}
> +
>  /*
>   * Pull the on-disk inode from the AGI unlinked list.
>   */
> @@ -2153,43 +2247,11 @@ xfs_iunlink_remove(
>  	} else {
>  		struct xfs_imap	imap;
>  
> -		/*
> -		 * We need to search the list for the inode being freed.
> -		 */
> -		last_ibp = NULL;
> -		while (next_agino != agino) {
> -			if (last_ibp)
> -				xfs_trans_brelse(tp, last_ibp);
> -
> -			imap.im_blkno = 0;
> -			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
> -
> -			error = xfs_imap(mp, tp, next_ino, &imap, 0);
> -			if (error) {
> -				xfs_warn(mp,
> -	"%s: xfs_imap returned error %d.",
> -					 __func__, error);
> -				goto out_unlock;
> -			}
> -
> -			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> -					       &last_ibp, 0, 0);
> -			if (error) {
> -				xfs_warn(mp,
> -	"%s: xfs_imap_to_bp returned error %d.",
> -					__func__, error);
> -				goto out_unlock;
> -			}
> -
> -			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
> -			if (!xfs_verify_agino(mp, agno, next_agino)) {
> -				XFS_CORRUPTION_ERROR(__func__,
> -						XFS_ERRLEVEL_LOW, mp,
> -						last_dip, sizeof(*last_dip));
> -				error = -EFSCORRUPTED;
> -				goto out_unlock;
> -			}
> -		}
> +		/* We need to search the list for the inode being freed. */
> +		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
> +				&next_ino, &imap, &last_dip, &last_ibp);

Another variable nit... next_ino is kind of confusing here, particularly
where we still use next_agino as well. The former is actually the "prev"
ino output, right?

Brian

> +		if (error)
> +			goto out_unlock;
>  
>  		/*
>  		 * Now last_ibp points to the buffer previous to us on the
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 7/8] xfs: add tracepoints for high level iunlink operations
  2019-01-31 23:18 ` [PATCH 7/8] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
@ 2019-02-01 19:01   ` Brian Foster
  2019-02-01 19:14     ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-01 19:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:34PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add tracepoints so we can associate high level operations with low level
> updates.  No functional changes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c |    4 ++++
>  fs/xfs/xfs_trace.h |    3 +++
>  2 files changed, 7 insertions(+)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index d5b3f8fdac7e..56349497d75b 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2030,6 +2030,8 @@ xfs_iunlink(
>  	ASSERT(VFS_I(ip)->i_mode != 0);
>  	ASSERT(xfs_verify_ino(mp, ip->i_ino));
>  
> +	trace_xfs_iunlink(ip);
> +

It's worth noting that current inode events print a global inode number
(i.e., "ino 0x%llx ...") whereas the new iunlink tracepoints from
previous patches print an agno/agino combination. I haven't looked at
the actual trace output from an unlink with these changes yet, but I do
wonder if there's value in having a more consistent format. Hm?

Brian

>  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> @@ -2199,6 +2201,8 @@ xfs_iunlink_remove(
>  	short			bucket_index;
>  	int			error;
>  
> +	trace_xfs_iunlink_remove(ip);
> +
>  	if (!xfs_verify_ino(mp, ip->i_ino))
>  		return -EFSCORRUPTED;
>  
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index fbec8f0e1a9a..22d4729143b5 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3423,6 +3423,9 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
>  		  __entry->new_ptr)
>  );
>  
> +DEFINE_INODE_EVENT(xfs_iunlink);
> +DEFINE_INODE_EVENT(xfs_iunlink_remove);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 7/8] xfs: add tracepoints for high level iunlink operations
  2019-02-01 19:01   ` Brian Foster
@ 2019-02-01 19:14     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-01 19:14 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 02:01:46PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:34PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add tracepoints so we can associate high level operations with low level
> > updates.  No functional changes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c |    4 ++++
> >  fs/xfs/xfs_trace.h |    3 +++
> >  2 files changed, 7 insertions(+)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index d5b3f8fdac7e..56349497d75b 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -2030,6 +2030,8 @@ xfs_iunlink(
> >  	ASSERT(VFS_I(ip)->i_mode != 0);
> >  	ASSERT(xfs_verify_ino(mp, ip->i_ino));
> >  
> > +	trace_xfs_iunlink(ip);
> > +
> 
> It's worth noting that current inode events print a global inode number
> (i.e., "ino 0x%llx ...") whereas the new iunlink tracepoints from
> previous patches print an agno/agino combination. I haven't looked at
> the actual trace output from an unlink with these changes yet, but I do
> wonder if there's value in having a more consistent format. Hm?

Hmm, I suppose it wouldn't be difficult to add a DEFINE_AG_INODE_EVENT
macrosoup that would make the reporting more consistent for these per-AG
inode operations.

--D

> Brian
> 
> >  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> >  	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> >  	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> > @@ -2199,6 +2201,8 @@ xfs_iunlink_remove(
> >  	short			bucket_index;
> >  	int			error;
> >  
> > +	trace_xfs_iunlink_remove(ip);
> > +
> >  	if (!xfs_verify_ino(mp, ip->i_ino))
> >  		return -EFSCORRUPTED;
> >  
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index fbec8f0e1a9a..22d4729143b5 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3423,6 +3423,9 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
> >  		  __entry->new_ptr)
> >  );
> >  
> > +DEFINE_INODE_EVENT(xfs_iunlink);
> > +DEFINE_INODE_EVENT(xfs_iunlink_remove);
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-01-31 23:18 ` [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
  2019-02-01  8:03   ` Christoph Hellwig
@ 2019-02-01 19:29   ` Brian Foster
  2019-02-01 19:40     ` Darrick J. Wong
  2019-02-02 17:01   ` Christoph Hellwig
  2 siblings, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-01 19:29 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Use a rhashtable to cache the unlinked list incore.  This should speed
> up unlinked processing considerably when there are a lot of inodes on
> the unlinked list because iunlink_remove no longer has to traverse an
> entire bucket list to find which inode points to the one being removed.
> 
> The incore list structure records "X.next_unlinked = Y" relations, with
> the rhashtable using Y to index the records.  This makes finding the
> inode X that points to a inode Y very quick.  If our cache fails to find
> anything we can always fall back on the old method.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

I still need to take a closer look at this one, but I want to make sure
I grok what's happening from the unlinked list perspective...

>  fs/xfs/xfs_inode.c       |  216 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_inode.h       |   10 ++
>  fs/xfs/xfs_log_recover.c |   12 ++-
>  fs/xfs/xfs_mount.c       |    7 +
>  fs/xfs/xfs_mount.h       |    1 
>  5 files changed, 245 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 56349497d75b..eec3c6109fc6 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
...
> @@ -2072,6 +2255,9 @@ xfs_iunlink(
>  		if (error)
>  			goto out_unlock;
>  		ASSERT(old_agino == NULLAGINO);
> +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> +		if (error)
> +			goto out_unlock;

agino has been unlinked, add a backref from the next inode (i.e., we
point to) back to agino.

>  	}
>  
>  	/* Point the head of the list to point to this inode. */
> @@ -2144,6 +2330,17 @@ xfs_iunlink_map_prev(
>  
>  	ASSERT(head_agino != target_agino);
>  
> +	/* See if our backref cache can find it faster. */
> +	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
> +	if (error == 0) {
> +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> +				&last_dip, &last_ibp);
> +		if (error)
> +			return error;
> +		goto out;
> +	}
> +
>  	next_agino = head_agino;
>  	while (next_agino != target_agino) {
>  		xfs_agino_t	unlinked_agino;
> @@ -2169,6 +2366,7 @@ xfs_iunlink_map_prev(
>  		next_agino = unlinked_agino;
>  	}
>  
> +out:
>  	/* Should never happen, but don't return garbage. */
>  	if (next_ino == NULLFSINO)
>  		return -EFSCORRUPTED;
> @@ -2243,6 +2441,12 @@ xfs_iunlink_remove(
>  		if (error)
>  			goto out_unlock;
>  
> +		if (next_agino != NULLAGINO) {
> +			error = xfs_iunlink_forget_backref(pag, next_agino);
> +			if (error)
> +				goto out_unlock;
> +		}

Removed from the head. We thus had no inode pointing to us, but the next
record had a ref that we pointed to it. Drop that since we're gone and
that inode is now the head.

> +
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
>  				next_agino);
> @@ -2265,10 +2469,22 @@ xfs_iunlink_remove(
>  				&next_agino);
>  		if (error)
>  			goto out_unlock;
> +		if (next_agino != NULLAGINO) {
> +			error = xfs_iunlink_forget_backref(pag, next_agino);
> +			if (error)
> +				goto out_unlock;
> +		}

Removed from the middle. next_agino had a backref to us, so we drop that
since we're going away.

>  
>  		/* Point the previous inode on the list to the next inode. */
>  		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
>  				next_ino, next_agino);
> +		if (next_agino == NULLAGINO)
> +			error = xfs_iunlink_forget_backref(pag, agino);
> +		else
> +			error = xfs_iunlink_change_backref(pag, agino,
> +					next_agino);

Now we deal with the backref for us (i.e., to whatever pointed at us and
now points at something else). If it now points at a real inode, change
the ref that pointed to us to point to our old next. If it doesn't,
delete the ref that pointed to us. Am I following that correctly?

Hmm, I realize this may be what's happening behind these wrappers, but
I'm wondering if the logic at this level would be easier to grok if we
just dropped the backref that points to our ip (IIUC we should always
have one here), if we point to something, drop the backref for that and
then add a new one for the prev->next link we've just created.

But I still need to take a closer look at all of this... it might be
more clear with some comments.

Brian

> +		if (error)
> +			goto out_unlock;
>  	}
>  	pag->pagi_unlinked_count--;
>  
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index be2014520155..9690f0d32e33 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -500,4 +500,14 @@ extern struct kmem_zone	*xfs_inode_zone;
>  
>  bool xfs_inode_verify_forks(struct xfs_inode *ip);
>  
> +int xfs_iunlink_init(struct xfs_perag *pag);
> +void xfs_iunlink_destroy(struct xfs_perag *pag);
> +int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
> +		xfs_agino_t *prev_agino);
> +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
> +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +
>  #endif	/* __XFS_INODE_H__ */
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index c920b8aeba01..909b70550aa8 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -5078,12 +5078,22 @@ xlog_recover_process_one_iunlink(
>  	agino = be32_to_cpu(dip->di_next_unlinked);
>  	xfs_buf_relse(ibp);
>  
> -	/* Make sure the in-core data knows about this unlinked inode. */
> +	/*
> +	 * Make sure the in-core data knows about this unlinked inode.  Since
> +	 * our iunlinks recovery basically just deletes the head of a bucket
> +	 * list until the bucket is empty, we need only to add the backref from
> +	 * the current list item to the next one, if this isn't the list tail.
> +	 */
>  	pag = xfs_perag_get(mp, agno);
>  	mutex_lock(&pag->pagi_unlinked_lock);
>  	pag->pagi_unlinked_count++;
> +	if (agino != NULLAGINO)
> +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> +				agino);
>  	mutex_unlock(&pag->pagi_unlinked_lock);
>  	xfs_perag_put(pag);
> +	if (error)
> +		goto fail_iput;
>  
>  	/*
>  	 * Prevent any DMAPI event from being sent when the reference on
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 6bfc985669e0..4eb97ddc915e 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -151,6 +151,7 @@ xfs_free_perag(
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
>  		ASSERT(pag->pagi_unlinked_count == 0 ||
>  		       XFS_FORCED_SHUTDOWN(mp));
> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pagi_unlinked_lock);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> @@ -231,6 +232,9 @@ xfs_initialize_perag(
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
>  		mutex_init(&pag->pagi_unlinked_lock);
> +		error = xfs_iunlink_init(pag);
> +		if (error)
> +			goto out_iunlink_mutex;
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> @@ -241,6 +245,8 @@ xfs_initialize_perag(
>  	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
>  	return 0;
>  
> +out_iunlink_mutex:
> +	mutex_destroy(&pag->pagi_unlinked_lock);
>  out_hash_destroy:
>  	xfs_buf_hash_destroy(pag);
>  out_free_pag:
> @@ -253,6 +259,7 @@ xfs_initialize_perag(
>  		if (!pag)
>  			break;
>  		xfs_buf_hash_destroy(pag);
> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pagi_unlinked_lock);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		kmem_free(pag);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 0fcc6b6a4f67..27a433e93d32 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -392,6 +392,7 @@ typedef struct xfs_perag {
>  	/* unlinked inodes */
>  	struct mutex		pagi_unlinked_lock;
>  	uint32_t		pagi_unlinked_count;
> +	struct rhashtable	pagi_unlinked_hash;
>  } xfs_perag_t;
>  
>  static inline struct xfs_ag_resv *
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/8] xfs: track unlinked inode counts in per-ag data
  2019-02-01 18:59   ` Brian Foster
@ 2019-02-01 19:33     ` Darrick J. Wong
  2019-02-02 16:14       ` Christoph Hellwig
  0 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-01 19:33 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 01:59:24PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:03PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Track the number of unlinked inodes in each AG so that we can use these
> > decisions to throttle inactivations when the unlinked list gets long.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c       |   40 +++++++++++++++++++++++++++++-----------
> >  fs/xfs/xfs_log_recover.c |    8 ++++++++
> >  fs/xfs/xfs_mount.c       |    5 +++++
> >  fs/xfs/xfs_mount.h       |    4 ++++
> >  4 files changed, 46 insertions(+), 11 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index d18354517320..98355f5f9253 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1900,6 +1900,7 @@ xfs_iunlink(
> >  	struct xfs_dinode	*dip;
> >  	struct xfs_buf		*agibp;
> >  	struct xfs_buf		*ibp;
> > +	struct xfs_perag	*pag;
> >  	xfs_agnumber_t		agno;
> >  	xfs_agino_t		agino;
> >  	short			bucket_index;
> > @@ -1912,6 +1913,8 @@ xfs_iunlink(
> >  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> >  	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> >  	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> > +	pag = xfs_perag_get(mp, agno);
> > +	mutex_lock(&pag->pagi_unlinked_lock);
> 
> Any particular reason for using a mutex over a spinlock or atomic_t?

Thinking this over, the pagi_unlinked_lock protects pagi_unlinked_count,
which is only used to assert that we don't have any unlinked inodes left
over during a clean unmount.  It /was/ used in the patch to make
->destroy_inode callers wait for inactivation, but since I dropped that
patch I don't need the counter anymore.

Maybe we can just drop this patch entirely?  If we leak any unlinked
inodes they'll get cleaned up at next mount... wait, we never did merge
Eric's patch to reap unlinked inodes at mount time, did we?

--D

> Brian
> 
> >  
> >  	/*
> >  	 * Get the agi buffer first.  It ensures lock ordering
> > @@ -1919,7 +1922,7 @@ xfs_iunlink(
> >  	 */
> >  	error = xfs_read_agi(mp, tp, agno, &agibp);
> >  	if (error)
> > -		return error;
> > +		goto out_unlock;
> >  	agi = XFS_BUF_TO_AGI(agibp);
> >  
> >  	/*
> > @@ -1939,7 +1942,7 @@ xfs_iunlink(
> >  		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> >  				       0, 0);
> >  		if (error)
> > -			return error;
> > +			goto out_unlock;
> >  
> >  		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
> >  		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> > @@ -1964,7 +1967,12 @@ xfs_iunlink(
> >  		(sizeof(xfs_agino_t) * bucket_index);
> >  	xfs_trans_log_buf(tp, agibp, offset,
> >  			  (offset + sizeof(xfs_agino_t) - 1));
> > -	return 0;
> > +	pag->pagi_unlinked_count++;
> > +
> > +out_unlock:
> > +	mutex_unlock(&pag->pagi_unlinked_lock);
> > +	xfs_perag_put(pag);
> > +	return error;
> >  }
> >  
> >  /*
> > @@ -1982,6 +1990,7 @@ xfs_iunlink_remove(
> >  	struct xfs_buf		*ibp;
> >  	struct xfs_buf		*last_ibp;
> >  	struct xfs_dinode	*last_dip = NULL;
> > +	struct xfs_perag	*pag;
> >  	xfs_ino_t		next_ino;
> >  	xfs_agnumber_t		agno;
> >  	xfs_agino_t		agino;
> > @@ -1997,6 +2006,8 @@ xfs_iunlink_remove(
> >  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> >  	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> >  	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> > +	pag = xfs_perag_get(mp, agno);
> > +	mutex_lock(&pag->pagi_unlinked_lock);
> >  
> >  	/*
> >  	 * Get the agi buffer first.  It ensures lock ordering
> > @@ -2004,7 +2015,7 @@ xfs_iunlink_remove(
> >  	 */
> >  	error = xfs_read_agi(mp, tp, agno, &agibp);
> >  	if (error)
> > -		return error;
> > +		goto out_unlock;
> >  	agi = XFS_BUF_TO_AGI(agibp);
> >  
> >  	/*
> > @@ -2015,7 +2026,8 @@ xfs_iunlink_remove(
> >  			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
> >  		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
> >  				agi, sizeof(*agi));
> > -		return -EFSCORRUPTED;
> > +		error = -EFSCORRUPTED;
> > +		goto out_unlock;
> >  	}
> >  
> >  	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
> > @@ -2031,7 +2043,7 @@ xfs_iunlink_remove(
> >  		if (error) {
> >  			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> >  				__func__, error);
> > -			return error;
> > +			goto out_unlock;
> >  		}
> >  		next_agino = be32_to_cpu(dip->di_next_unlinked);
> >  		ASSERT(next_agino != 0);
> > @@ -2080,7 +2092,7 @@ xfs_iunlink_remove(
> >  				xfs_warn(mp,
> >  	"%s: xfs_imap returned error %d.",
> >  					 __func__, error);
> > -				return error;
> > +				goto out_unlock;
> >  			}
> >  
> >  			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> > @@ -2089,7 +2101,7 @@ xfs_iunlink_remove(
> >  				xfs_warn(mp,
> >  	"%s: xfs_imap_to_bp returned error %d.",
> >  					__func__, error);
> > -				return error;
> > +				goto out_unlock;
> >  			}
> >  
> >  			last_offset = imap.im_boffset;
> > @@ -2098,7 +2110,8 @@ xfs_iunlink_remove(
> >  				XFS_CORRUPTION_ERROR(__func__,
> >  						XFS_ERRLEVEL_LOW, mp,
> >  						last_dip, sizeof(*last_dip));
> > -				return -EFSCORRUPTED;
> > +				error = -EFSCORRUPTED;
> > +				goto out_unlock;
> >  			}
> >  		}
> >  
> > @@ -2111,7 +2124,7 @@ xfs_iunlink_remove(
> >  		if (error) {
> >  			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
> >  				__func__, error);
> > -			return error;
> > +			goto out_unlock;
> >  		}
> >  		next_agino = be32_to_cpu(dip->di_next_unlinked);
> >  		ASSERT(next_agino != 0);
> > @@ -2146,7 +2159,12 @@ xfs_iunlink_remove(
> >  				  (offset + sizeof(xfs_agino_t) - 1));
> >  		xfs_inobp_check(mp, last_ibp);
> >  	}
> > -	return 0;
> > +	pag->pagi_unlinked_count--;
> > +
> > +out_unlock:
> > +	mutex_unlock(&pag->pagi_unlinked_lock);
> > +	xfs_perag_put(pag);
> > +	return error;
> >  }
> >  
> >  /*
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index ff9a27834c50..c920b8aeba01 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -5054,6 +5054,7 @@ xlog_recover_process_one_iunlink(
> >  	struct xfs_buf			*ibp;
> >  	struct xfs_dinode		*dip;
> >  	struct xfs_inode		*ip;
> > +	struct xfs_perag		*pag;
> >  	xfs_ino_t			ino;
> >  	int				error;
> >  
> > @@ -5077,6 +5078,13 @@ xlog_recover_process_one_iunlink(
> >  	agino = be32_to_cpu(dip->di_next_unlinked);
> >  	xfs_buf_relse(ibp);
> >  
> > +	/* Make sure the in-core data knows about this unlinked inode. */
> > +	pag = xfs_perag_get(mp, agno);
> > +	mutex_lock(&pag->pagi_unlinked_lock);
> > +	pag->pagi_unlinked_count++;
> > +	mutex_unlock(&pag->pagi_unlinked_lock);
> > +	xfs_perag_put(pag);
> > +
> >  	/*
> >  	 * Prevent any DMAPI event from being sent when the reference on
> >  	 * the inode is dropped.
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 10be706ec72e..6bfc985669e0 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -149,6 +149,9 @@ xfs_free_perag(
> >  		spin_unlock(&mp->m_perag_lock);
> >  		ASSERT(pag);
> >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> > +		ASSERT(pag->pagi_unlinked_count == 0 ||
> > +		       XFS_FORCED_SHUTDOWN(mp));
> > +		mutex_destroy(&pag->pagi_unlinked_lock);
> >  		xfs_buf_hash_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> > @@ -227,6 +230,7 @@ xfs_initialize_perag(
> >  		/* first new pag is fully initialized */
> >  		if (first_initialised == NULLAGNUMBER)
> >  			first_initialised = index;
> > +		mutex_init(&pag->pagi_unlinked_lock);
> >  	}
> >  
> >  	index = xfs_set_inode_alloc(mp, agcount);
> > @@ -249,6 +253,7 @@ xfs_initialize_perag(
> >  		if (!pag)
> >  			break;
> >  		xfs_buf_hash_destroy(pag);
> > +		mutex_destroy(&pag->pagi_unlinked_lock);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		kmem_free(pag);
> >  	}
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index e344b1dfde63..0fcc6b6a4f67 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -388,6 +388,10 @@ typedef struct xfs_perag {
> >  
> >  	/* reference count */
> >  	uint8_t			pagf_refcount_level;
> > +
> > +	/* unlinked inodes */
> > +	struct mutex		pagi_unlinked_lock;
> > +	uint32_t		pagi_unlinked_count;
> >  } xfs_perag_t;
> >  
> >  static inline struct xfs_ag_resv *
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-02-01 19:29   ` Brian Foster
@ 2019-02-01 19:40     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-01 19:40 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 02:29:02PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Use a rhashtable to cache the unlinked list incore.  This should speed
> > up unlinked processing considerably when there are a lot of inodes on
> > the unlinked list because iunlink_remove no longer has to traverse an
> > entire bucket list to find which inode points to the one being removed.
> > 
> > The incore list structure records "X.next_unlinked = Y" relations, with
> > the rhashtable using Y to index the records.  This makes finding the
> > inode X that points to a inode Y very quick.  If our cache fails to find
> > anything we can always fall back on the old method.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> I still need to take a closer look at this one, but I want to make sure
> I grok what's happening from the unlinked list perspective...
> 
> >  fs/xfs/xfs_inode.c       |  216 ++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_inode.h       |   10 ++
> >  fs/xfs/xfs_log_recover.c |   12 ++-
> >  fs/xfs/xfs_mount.c       |    7 +
> >  fs/xfs/xfs_mount.h       |    1 
> >  5 files changed, 245 insertions(+), 1 deletion(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 56349497d75b..eec3c6109fc6 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> ...
> > @@ -2072,6 +2255,9 @@ xfs_iunlink(
> >  		if (error)
> >  			goto out_unlock;
> >  		ASSERT(old_agino == NULLAGINO);
> > +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> > +		if (error)
> > +			goto out_unlock;
> 
> agino has been unlinked, add a backref from the next inode (i.e., we
> point to) back to agino.
> 
> >  	}
> >  
> >  	/* Point the head of the list to point to this inode. */
> > @@ -2144,6 +2330,17 @@ xfs_iunlink_map_prev(
> >  
> >  	ASSERT(head_agino != target_agino);
> >  
> > +	/* See if our backref cache can find it faster. */
> > +	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
> > +	if (error == 0) {
> > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> > +				&last_dip, &last_ibp);
> > +		if (error)
> > +			return error;
> > +		goto out;
> > +	}
> > +
> >  	next_agino = head_agino;
> >  	while (next_agino != target_agino) {
> >  		xfs_agino_t	unlinked_agino;
> > @@ -2169,6 +2366,7 @@ xfs_iunlink_map_prev(
> >  		next_agino = unlinked_agino;
> >  	}
> >  
> > +out:
> >  	/* Should never happen, but don't return garbage. */
> >  	if (next_ino == NULLFSINO)
> >  		return -EFSCORRUPTED;
> > @@ -2243,6 +2441,12 @@ xfs_iunlink_remove(
> >  		if (error)
> >  			goto out_unlock;
> >  
> > +		if (next_agino != NULLAGINO) {
> > +			error = xfs_iunlink_forget_backref(pag, next_agino);
> > +			if (error)
> > +				goto out_unlock;
> > +		}
> 
> Removed from the head. We thus had no inode pointing to us, but the next
> record had a ref that we pointed to it. Drop that since we're gone and
> that inode is now the head.
> 
> > +
> >  		/* Point the head of the list to the next unlinked inode. */
> >  		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
> >  				next_agino);
> > @@ -2265,10 +2469,22 @@ xfs_iunlink_remove(
> >  				&next_agino);
> >  		if (error)
> >  			goto out_unlock;
> > +		if (next_agino != NULLAGINO) {
> > +			error = xfs_iunlink_forget_backref(pag, next_agino);
> > +			if (error)
> > +				goto out_unlock;
> > +		}
> 
> Removed from the middle. next_agino had a backref to us, so we drop that
> since we're going away.
> 
> >  
> >  		/* Point the previous inode on the list to the next inode. */
> >  		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
> >  				next_ino, next_agino);
> > +		if (next_agino == NULLAGINO)
> > +			error = xfs_iunlink_forget_backref(pag, agino);
> > +		else
> > +			error = xfs_iunlink_change_backref(pag, agino,
> > +					next_agino);
> 
> Now we deal with the backref for us (i.e., to whatever pointed at us and
> now points at something else). If it now points at a real inode, change
> the ref that pointed to us to point to our old next. If it doesn't,
> delete the ref that pointed to us. Am I following that correctly?
> 
> Hmm, I realize this may be what's happening behind these wrappers, but
> I'm wondering if the logic at this level would be easier to grok if we
> just dropped the backref that points to our ip (IIUC we should always
> have one here), if we point to something, drop the backref for that and
> then add a new one for the prev->next link we've just created.
> 
> But I still need to take a closer look at all of this... it might be
> more clear with some comments.

Yep, your understanding of what we're doing with the backrefs is correct.
I'll add some comments to make what we're doing clearer.

FWIW the reason for having a separate change_backref is to avoid freeing
the old backref only to allocate memory for a new backref immediately.

--D

> Brian
> 
> > +		if (error)
> > +			goto out_unlock;
> >  	}
> >  	pag->pagi_unlinked_count--;
> >  
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index be2014520155..9690f0d32e33 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -500,4 +500,14 @@ extern struct kmem_zone	*xfs_inode_zone;
> >  
> >  bool xfs_inode_verify_forks(struct xfs_inode *ip);
> >  
> > +int xfs_iunlink_init(struct xfs_perag *pag);
> > +void xfs_iunlink_destroy(struct xfs_perag *pag);
> > +int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
> > +		xfs_agino_t *prev_agino);
> > +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> > +		xfs_agino_t this_agino);
> > +int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
> > +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> > +		xfs_agino_t this_agino);
> > +
> >  #endif	/* __XFS_INODE_H__ */
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index c920b8aeba01..909b70550aa8 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -5078,12 +5078,22 @@ xlog_recover_process_one_iunlink(
> >  	agino = be32_to_cpu(dip->di_next_unlinked);
> >  	xfs_buf_relse(ibp);
> >  
> > -	/* Make sure the in-core data knows about this unlinked inode. */
> > +	/*
> > +	 * Make sure the in-core data knows about this unlinked inode.  Since
> > +	 * our iunlinks recovery basically just deletes the head of a bucket
> > +	 * list until the bucket is empty, we need only to add the backref from
> > +	 * the current list item to the next one, if this isn't the list tail.
> > +	 */
> >  	pag = xfs_perag_get(mp, agno);
> >  	mutex_lock(&pag->pagi_unlinked_lock);
> >  	pag->pagi_unlinked_count++;
> > +	if (agino != NULLAGINO)
> > +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> > +				agino);
> >  	mutex_unlock(&pag->pagi_unlinked_lock);
> >  	xfs_perag_put(pag);
> > +	if (error)
> > +		goto fail_iput;
> >  
> >  	/*
> >  	 * Prevent any DMAPI event from being sent when the reference on
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 6bfc985669e0..4eb97ddc915e 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -151,6 +151,7 @@ xfs_free_perag(
> >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> >  		ASSERT(pag->pagi_unlinked_count == 0 ||
> >  		       XFS_FORCED_SHUTDOWN(mp));
> > +		xfs_iunlink_destroy(pag);
> >  		mutex_destroy(&pag->pagi_unlinked_lock);
> >  		xfs_buf_hash_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> > @@ -231,6 +232,9 @@ xfs_initialize_perag(
> >  		if (first_initialised == NULLAGNUMBER)
> >  			first_initialised = index;
> >  		mutex_init(&pag->pagi_unlinked_lock);
> > +		error = xfs_iunlink_init(pag);
> > +		if (error)
> > +			goto out_iunlink_mutex;
> >  	}
> >  
> >  	index = xfs_set_inode_alloc(mp, agcount);
> > @@ -241,6 +245,8 @@ xfs_initialize_perag(
> >  	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
> >  	return 0;
> >  
> > +out_iunlink_mutex:
> > +	mutex_destroy(&pag->pagi_unlinked_lock);
> >  out_hash_destroy:
> >  	xfs_buf_hash_destroy(pag);
> >  out_free_pag:
> > @@ -253,6 +259,7 @@ xfs_initialize_perag(
> >  		if (!pag)
> >  			break;
> >  		xfs_buf_hash_destroy(pag);
> > +		xfs_iunlink_destroy(pag);
> >  		mutex_destroy(&pag->pagi_unlinked_lock);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		kmem_free(pag);
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index 0fcc6b6a4f67..27a433e93d32 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -392,6 +392,7 @@ typedef struct xfs_perag {
> >  	/* unlinked inodes */
> >  	struct mutex		pagi_unlinked_lock;
> >  	uint32_t		pagi_unlinked_count;
> > +	struct rhashtable	pagi_unlinked_hash;
> >  } xfs_perag_t;
> >  
> >  static inline struct xfs_ag_resv *
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-02-01  8:03   ` Christoph Hellwig
@ 2019-02-01 23:59     ` Dave Chinner
  2019-02-02  4:31       ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Dave Chinner @ 2019-02-01 23:59 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Darrick J. Wong, linux-xfs

On Fri, Feb 01, 2019 at 12:03:43AM -0800, Christoph Hellwig wrote:
> On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Use a rhashtable to cache the unlinked list incore.  This should speed
> > up unlinked processing considerably when there are a lot of inodes on
> > the unlinked list because iunlink_remove no longer has to traverse an
> > entire bucket list to find which inode points to the one being removed.
> 
> This sounds pretty reasonable and a real review will follow.  But can
> you quantify the considerably speedups for real life workloads?

When you have more than a few hundred open-but-unlinked inodes in an
AG, the unlinked list walking starts to take a significant amount of
time when the final reference to an inode goes away. It's
essentially an O(N) buffer walk, so takes a lot of CPU time when a
list gets more than a few inode long.

With darrick's background inactivation, the lists grow to tens of
thousands of inodes very quickly. I was seeing >95% of CPU time
being spend in unlinked list walking on large scale 'rm -rf'
workloads and performance absolutely dropped off a cliff. My initial
code (from years and years ago) used a
double linked list, and when I forward ported that and ran it up,
the CPU overhead of unlink list removal was cut to almost nothing and
all the performance regressions with background inactivation went
away. i.e. Unlink list removal went from an O(N) overhead to always
being O(1), so the length of the list didnt' affect performance any
more.

Darrick's code replaces the list_head in each inode I used with a
rhashtable - it removes the 16 bytes per-inode memory footprint my
implementation required, but otherwise is identical.  Hence I expect
that it'll show exactly the same performance characteristics as the
existing code.

IOWs, it's really required for backgrounding and parallelising inode
inactivation, but otherwise it won't make any noticable/measurable
difference to most workloads because the unlinked lists almost never
grows more than one inode in length and so O(N) ~= O(1) almost all
the time...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-02-01 23:59     ` Dave Chinner
@ 2019-02-02  4:31       ` Darrick J. Wong
  2019-02-02 16:07         ` Christoph Hellwig
  0 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02  4:31 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, linux-xfs

On Sat, Feb 02, 2019 at 10:59:00AM +1100, Dave Chinner wrote:
> On Fri, Feb 01, 2019 at 12:03:43AM -0800, Christoph Hellwig wrote:
> > On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Use a rhashtable to cache the unlinked list incore.  This should speed
> > > up unlinked processing considerably when there are a lot of inodes on
> > > the unlinked list because iunlink_remove no longer has to traverse an
> > > entire bucket list to find which inode points to the one being removed.
> > 
> > This sounds pretty reasonable and a real review will follow.  But can
> > you quantify the considerably speedups for real life workloads?
> 
> When you have more than a few hundred open-but-unlinked inodes in an
> AG, the unlinked list walking starts to take a significant amount of
> time when the final reference to an inode goes away. It's
> essentially an O(N) buffer walk, so takes a lot of CPU time when a
> list gets more than a few inode long.
> 
> With darrick's background inactivation, the lists grow to tens of
> thousands of inodes very quickly. I was seeing >95% of CPU time
> being spend in unlinked list walking on large scale 'rm -rf'
> workloads and performance absolutely dropped off a cliff. My initial
> code (from years and years ago) used a
> double linked list, and when I forward ported that and ran it up,
> the CPU overhead of unlink list removal was cut to almost nothing and
> all the performance regressions with background inactivation went
> away. i.e. Unlink list removal went from an O(N) overhead to always
> being O(1), so the length of the list didnt' affect performance any
> more.
> 
> Darrick's code replaces the list_head in each inode I used with a
> rhashtable - it removes the 16 bytes per-inode memory footprint my
> implementation required, but otherwise is identical.  Hence I expect
> that it'll show exactly the same performance characteristics as the
> existing code.
> 
> IOWs, it's really required for backgrounding and parallelising inode
> inactivation, but otherwise it won't make any noticable/measurable
> difference to most workloads because the unlinked lists almost never
> grows more than one inode in length and so O(N) ~= O(1) almost all
> the time...

I wrote a silly program to open as many O_TMPFILE files as it can and
then close them all.  I wrapped that in a script to crank up ulimit -n
to fs.file_max (which on this VM was about 194000 files) and came up
with this on a 5.0-rc4 kernel with deferred inactivation and iunlink
backref caching:

+ /d/t/tmpfile/tmpfile
Opened 193519 files in 6.44s.
Closed 193519 files in 1.37s

real    0m7.818s
user    0m0.083s
sys     0m7.373s
+ cd /
+ umount /mnt

real    0m5.681s
user    0m0.008s
sys     0m0.000s

I then repeated the experiment with a vanilla 5.0-rc2 kernel I had lying
around and saw much slower results:

+ /d/t/tmpfile/tmpfile
Opened 193588 files in 6.35s.
Closed 193588 files in 751.61s

real    12m38.853s
user    0m0.084s
sys     12m34.470s
+ cd /
+ umount /mnt

real    0m0.086s
user    0m0.000s
sys     0m0.060s

The VM had 2G of RAM and a 3GB SCSI disk backed by a tmpfs file.

--D

Wrapper script:

#!/bin/bash -x

dir="$(dirname "$0")"

umount /mnt
rmmod xfs
mkfs.xfs -f /dev/sda
mount /dev/sda /mnt

ulimit -n $(cat /proc/sys/fs/file-max)

cd /mnt
time $dir/tmpfile
cd /
time umount /mnt

and tmpfile.c:

/* Open files and leak them. */
#define _GNU_SOURCE
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

static int min_fd = -1;
static int max_fd = -1;
static unsigned int nr_opened = 0;
static float start_time;

void clock_time(float *time)
{
	static clockid_t clkid = CLOCK_MONOTONIC;
	struct timespec ts;
	int ret;

retry:
	ret = clock_gettime(clkid, &ts);
	if (ret) {
		if (clkid == CLOCK_MONOTONIC) {
			clkid = CLOCK_REALTIME;
			goto retry;
		}
		perror("clock_gettime");
		exit(2);
	}
	*time = ts.tv_sec + ((float)ts.tv_nsec / 1000000000);
}

/*
 * Exit the program due to an error.
 *
 * If we've exhausted all the file descriptors, make sure we close all the
 * open fds in the order we received them in order to exploit a quirk of ext4
 * and xfs where the oldest unlinked inodes are at the /end/ of the unlinked
 * lists, which will make removing the unlinked files maximally painful.
 *
 * If it's some other error, just die and let the kernel sort it out.
 */
void die(void)
{
	float end_time;
	int fd;

	switch (errno) {
	case EMFILE:
	case ENFILE:
	case ENOSPC:
		clock_time(&end_time);
		printf("Opened %u files in %.2fs.\n", nr_opened,
				end_time - start_time);
		clock_time(&start_time);

		for (fd = min_fd; fd <= max_fd; fd++)
			close(fd);
		clock_time(&end_time);
		printf("Closed %u files in %.2fs\n", nr_opened,
				end_time - start_time);
		exit(0);
		break;
	default:
		perror("open?");
		exit(2);
		break;
	}
}

/* Remember how many file we open and all that. */
void remember_fd(int fd)
{
	if (min_fd == -1 || min_fd > fd)
		min_fd = fd;
	if (max_fd == -1 || max_fd < fd)
		max_fd = fd;
	nr_opened++;
}

/* Put an opened file on the unlinked list and leak the fd. */
void leak_tmpfile(void)
{
	int fd = -1;
	int ret;

	/* Try to create an O_TMPFILE and leak the fd. */
#ifdef O_TMPFILE
	fd = open(".", O_TMPFILE | O_RDWR, 0644);
	if (fd >= 0) {
		remember_fd(fd);
		return;
	}
	if (fd < 0)
		die();
#endif

	/* Oh well, create a new file, unlink it, and leak the fd. */
	fd = open("./moo", O_CREAT | O_RDWR, 0644);
	if (fd < 0)
		die();
	ret = unlink("./moo");
	if (ret)
		die();
	remember_fd(fd);
}

/* Try to put as many files on the unlinked list and then kill them. */
int main(int argc, char *argv[])
{
	clock_time(&start_time);
	while (1)
		leak_tmpfile();
	return 0;
}

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-02-02  4:31       ` Darrick J. Wong
@ 2019-02-02 16:07         ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:07 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Dave Chinner, Christoph Hellwig, linux-xfs

On Fri, Feb 01, 2019 at 08:31:59PM -0800, Darrick J. Wong wrote:
> 
> I wrote a silly program to open as many O_TMPFILE files as it can and
> then close them all.  I wrapped that in a script to crank up ulimit -n
> to fs.file_max (which on this VM was about 194000 files) and came up
> with this on a 5.0-rc4 kernel with deferred inactivation and iunlink
> backref caching:

Ah, cool.  Maybe throw a one-line summary of the results into the commit
log for the next version.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/8] xfs: track unlinked inode counts in per-ag data
  2019-02-01 19:33     ` Darrick J. Wong
@ 2019-02-02 16:14       ` Christoph Hellwig
  2019-02-02 19:28         ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:14 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, linux-xfs

On Fri, Feb 01, 2019 at 11:33:48AM -0800, Darrick J. Wong wrote:
> Maybe we can just drop this patch entirely?  If we leak any unlinked
> inodes they'll get cleaned up at next mount... wait, we never did merge
> Eric's patch to reap unlinked inodes at mount time, did we?

Which patch?  We must recovery unlinked inodes during each mount that
requires log recovery and have done that since the very beginning.

I don't see how we could ever see unlinked inodes during a clean
mount.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/8] xfs: refactor AGI unlinked bucket updates
  2019-01-31 23:18 ` [PATCH 3/8] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
  2019-02-01 19:00   ` Brian Foster
@ 2019-02-02 16:21   ` Christoph Hellwig
  2019-02-02 19:51     ` Darrick J. Wong
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:21 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

> +	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
> +	xfs_agino_t		old_value;
> +	int			offset;
> +
> +	ASSERT(new_agino == NULLAGINO ||
> +	       xfs_verify_agino(tp->t_mountp, be32_to_cpu(agi->agi_seqno),
> +				new_agino));
> +
> +	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> +	trace_xfs_iunlink_update_bucket(tp->t_mountp,
> +			be32_to_cpu(agi->agi_seqno), bucket_index, old_value,
> +			new_agino);

As already mentioned by Brian I'd rather pass the agno rather than
recalculating it twice.

> +	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
> +	offset = offsetof(struct xfs_agi, agi_unlinked) +
> +			(sizeof(xfs_agino_t) * bucket_index);
> +	xfs_trans_log_buf(tp, agibp, offset,
> +			(offset + sizeof(xfs_agino_t) - 1));

No need for the braces here.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks
  2019-01-31 23:18 ` [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
  2019-02-01 19:00   ` Brian Foster
@ 2019-02-02 16:22   ` Christoph Hellwig
  1 sibling, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:22 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 5/8] xfs: refactor inode unlinked pointer update functions
  2019-01-31 23:18 ` [PATCH 5/8] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
  2019-02-01 19:01   ` Brian Foster
@ 2019-02-02 16:27   ` Christoph Hellwig
  2019-02-02 20:29     ` Darrick J. Wong
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:27 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

> +	ASSERT(new_agino == NULLAGINO ||
> +	       xfs_verify_agino(mp, XFS_INO_TO_AGNO(mp, ino), new_agino));

We have quite a few uses of this pattern.  Shouldn't we just add a
a xfs_verify_agino_or_null helper?

Also given that the caller has the agno and we use it a few times
maybe pass it as an argument?

Also shouldn't new_agino be called something like next_agino or at
least have a little comment explaining what it stands for?

> +	xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1));

No need for the innter braces.

Otherwise this looks conceptually fine to me.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-01-31 23:18 ` [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function Darrick J. Wong
  2019-02-01 19:01   ` Brian Foster
@ 2019-02-02 16:30   ` Christoph Hellwig
  2019-02-02 20:42     ` Darrick J. Wong
  2019-02-02 16:51   ` Christoph Hellwig
  2 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:30 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

Nitpick: why hoist here and refactor in the previous patches subject?

> +	xfs_agino_t		target_agino,
> +	xfs_ino_t		*ino,
> +	struct xfs_imap		*imap,
> +	struct xfs_dinode	**dipp,
> +	struct xfs_buf		**bpp)
> +{
> +	struct xfs_imap		last_imap;
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_buf		*last_ibp = NULL;
> +	struct xfs_dinode	*last_dip;
> +	xfs_ino_t		next_ino = NULLFSINO;
> +	xfs_agino_t		next_agino;
> +	int			error;
> +
> +	ASSERT(head_agino != target_agino);
> +
> +	next_agino = head_agino;
> +	while (next_agino != target_agino) {
> +		xfs_agino_t	unlinked_agino;
> +
> +		if (last_ibp)
> +			xfs_trans_brelse(tp, last_ibp);
> +
> +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> +				&last_dip, &last_ibp);

And reason we stop formatting directly into the passed in imap,
as the old code did?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-01-31 23:18 ` [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function Darrick J. Wong
  2019-02-01 19:01   ` Brian Foster
  2019-02-02 16:30   ` Christoph Hellwig
@ 2019-02-02 16:51   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 16:51 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

> +/* Return the imap, dinode pointer, and buffer for an inode. */
> +STATIC int
> +xfs_iunlink_map_ino(
> +	struct xfs_trans	*tp,
> +	xfs_ino_t		next_ino,
> +	struct xfs_imap		*imap,
> +	struct xfs_dinode	**dipp,
> +	struct xfs_buf		**bpp)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	int			error;
> +
> +	imap->im_blkno = 0;
> +	error = xfs_imap(mp, tp, next_ino, imap, 0);

Looking at the next patch it actually would be nicer if we'd pass
agno/agino instead of the inode number to this function,
as currently both callers need to duplicate the caculation of the
ino.

In fact the first thing xfs_imap does is to split the ino into agno
and agino again, so maybe we should throw in a prep patch to change
the xfs_imap calling convention as well.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable
  2019-01-31 23:18 ` [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
  2019-02-01  8:03   ` Christoph Hellwig
  2019-02-01 19:29   ` Brian Foster
@ 2019-02-02 17:01   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2019-02-02 17:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> +/*
> + * Faster In-Core Unlinked List Lookups
> + * ====================================

I'd drop the "Faster " here - the whole point of a separate in-core
lookup is generally that it is faster.

> +struct xfs_iunlink_key {
> +	xfs_agino_t		iu_next_unlinked;
> +};

I don't think we need this structure at all.  We can just directly
pass a pointer to the xfs_agino_t instead.

> +	.obj_cmpfn		= _xfs_iunlink_obj_cmp,

No real need for the _ prefix.  Also it would generally be nice
if the method implementation name is prefix + method name.

> +int
> +xfs_iunlink_lookup_backref(
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		agino,
> +	xfs_agino_t		*prev_agino)
> +{
> +	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
> +	struct xfs_iunlink	*iu;
> +
> +	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
> +			xfs_iunlink_hash_params);
> +	if (!iu)
> +		return -ENOENT;
> +	*prev_agino = iu->iu_agino;
> +	return 0;

I think we can just retun the xfs_agino_t as the return value, with
NULLAGINO as the not found return value.

> +
> +/* Forget that X.next_unlinked = @agino. */
> +int
> +xfs_iunlink_forget_backref(
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		agino)
> +{
> +	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
> +	struct xfs_iunlink	*iu;
> +	int			error;
> +
> +	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
> +			xfs_iunlink_hash_params);
> +	if (!iu)
> +		return -ENOENT;
> +
> +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> +	if (error)
> +		return error;
> +
> +	kmem_free(iu);
> +	return 0;

This mostly duplicates the change_backref help.  If we just free
the iu for a NULLAGINO next_unlinked there we can merge the two
functions easily.

> +int xfs_iunlink_init(struct xfs_perag *pag);
> +void xfs_iunlink_destroy(struct xfs_perag *pag);
> +int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
> +		xfs_agino_t *prev_agino);
> +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
> +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);

Half of these aren't used outside xfs_inode.c.

> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pagi_unlinked_lock);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> @@ -231,6 +232,9 @@ xfs_initialize_perag(
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
>  		mutex_init(&pag->pagi_unlinked_lock);
> +		error = xfs_iunlink_init(pag);
> +		if (error)
> +			goto out_iunlink_mutex;

Any good reason pagi_unlinked_lock isn't handled in
xfs_iunlink_init / xfs_iunlink_destroy?

Untested patch for most of my comments above:

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e797b11ad842..f8da8ce48348 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1907,7 +1907,7 @@ xfs_inactive(
 }
 
 /*
- * Faster In-Core Unlinked List Lookups
+ * In-Core Unlinked List Lookups
  * ====================================
  *
  * Every inode is supposed to be reachable from some other piece of metadata
@@ -1937,20 +1937,16 @@ struct xfs_iunlink {
 	xfs_agino_t		iu_next_unlinked;
 };
 
-struct xfs_iunlink_key {
-	xfs_agino_t		iu_next_unlinked;
-};
-
 /* Unlinked list predecessor lookup hashtable construction */
 static int
-xfs_iunlink_obj_cmp(
+xfs_iunlink_obj_cmpfn(
 	struct rhashtable_compare_arg	*arg,
 	const void			*obj)
 {
-	const struct xfs_iunlink_key	*key = arg->key;
+	const xfs_agino_t		*key = arg->key;
 	const struct xfs_iunlink	*iu = obj;
 
-	if (iu->iu_next_unlinked != key->iu_next_unlinked)
+	if (iu->iu_next_unlinked != *key)
 		return 1;
 	return 0;
 }
@@ -1962,28 +1958,25 @@ static const struct rhashtable_params xfs_iunlink_hash_params = {
 	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
 	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
 	.automatic_shrinking	= true,
-	.obj_cmpfn		= xfs_iunlink_obj_cmp,
+	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
 };
 
 /*
- * Return X (in @prev_agino), where X.next_unlinked == @agino.  Returns -ENOENT
- * if no such relation is found.
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
  */
-static int
+static xfs_agino_t
 xfs_iunlink_lookup_backref(
 	struct xfs_perag	*pag,
-	xfs_agino_t		agino,
-	xfs_agino_t		*prev_agino)
+	xfs_agino_t		agino)
 {
-	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
 	struct xfs_iunlink	*iu;
 
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
 			xfs_iunlink_hash_params);
 	if (!iu)
-		return -ENOENT;
-	*prev_agino = iu->iu_agino;
-	return 0;
+		return NULLAGINO;
+	return iu->iu_agino;
 }
 
 /* Remember that @prev_agino.next_unlinked = @this_agino. */
@@ -2003,31 +1996,6 @@ xfs_iunlink_add_backref(
 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
 }
 
-/* Forget that X.next_unlinked = @agino. */
-static int
-xfs_iunlink_forget_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		agino)
-{
-	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
-	struct xfs_iunlink	*iu;
-	int			error;
-
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
-			xfs_iunlink_hash_params);
-	if (!iu)
-		return -ENOENT;
-
-	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
-			&iu->iu_rhash_head, xfs_iunlink_hash_params);
-	if (error)
-		return error;
-
-	kmem_free(iu);
-	return 0;
-}
-
-
 /* Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. */
 static int
 xfs_iunlink_change_backref(
@@ -2035,11 +2003,10 @@ xfs_iunlink_change_backref(
 	xfs_agino_t		agino,
 	xfs_agino_t		next_unlinked)
 {
-	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
 	struct xfs_iunlink	*iu;
 	int			error;
 
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
 			xfs_iunlink_hash_params);
 	if (!iu)
 		return -ENOENT;
@@ -2049,8 +2016,18 @@ xfs_iunlink_change_backref(
 	if (error)
 		return error;
 
-	iu->iu_next_unlinked = next_unlinked;
+	/*
+	 * If there is no new next entry just free our item and return.
+	 */
+	if (next_unlinked == NULLAGINO) {
+		kmem_free(iu);
+		return 0;
+	}
 
+	/*
+	 * Else update the hash table to the new entry.
+	 */
+	iu->iu_next_unlinked = next_unlinked;
 	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
 }
@@ -2357,8 +2334,8 @@ xfs_iunlink_map_prev(
 	ASSERT(head_agino != target_agino);
 
 	/* See if our backref cache can find it faster. */
-	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
-	if (error == 0) {
+	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
+	if (next_agino != NULLAGINO) {
 		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
 		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
 				&last_dip, &last_ibp);
@@ -2468,7 +2445,8 @@ xfs_iunlink_remove(
 			goto out_unlock;
 
 		if (next_agino != NULLAGINO) {
-			error = xfs_iunlink_forget_backref(pag, next_agino);
+			error = xfs_iunlink_change_backref(pag, next_agino,
+					NULLAGINO);
 			if (error)
 				goto out_unlock;
 		}
@@ -2496,7 +2474,8 @@ xfs_iunlink_remove(
 		if (error)
 			goto out_unlock;
 		if (next_agino != NULLAGINO) {
-			error = xfs_iunlink_forget_backref(pag, next_agino);
+			error = xfs_iunlink_change_backref(pag, next_agino,
+					NULLAGINO);
 			if (error)
 				goto out_unlock;
 		}
@@ -2504,11 +2483,7 @@ xfs_iunlink_remove(
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
 				next_ino, next_agino);
-		if (next_agino == NULLAGINO)
-			error = xfs_iunlink_forget_backref(pag, agino);
-		else
-			error = xfs_iunlink_change_backref(pag, agino,
-					next_agino);
+		error = xfs_iunlink_change_backref(pag, agino, next_agino);
 		if (error)
 			goto out_unlock;
 	}

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] xfs: clean up iunlink functions
  2019-02-01  8:01   ` Christoph Hellwig
@ 2019-02-02 19:15     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 19:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 12:01:36AM -0800, Christoph Hellwig wrote:
> On Thu, Jan 31, 2019 at 03:17:56PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Fix some indentation issues with the iunlink functions and reorganize
> > the tops of the functions to be identical.  No functional changes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c |   77 +++++++++++++++++++++++++++-------------------------
> >  1 file changed, 40 insertions(+), 37 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index c8bf02be0003..d18354517320 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1892,26 +1892,32 @@ xfs_inactive(
> >   */
> >  STATIC int
> >  xfs_iunlink(
> > -	struct xfs_trans *tp,
> > -	struct xfs_inode *ip)
> > +	struct xfs_trans	*tp,
> > +	struct xfs_inode	*ip)
> >  {
> > -	xfs_mount_t	*mp = tp->t_mountp;
> > -	xfs_agi_t	*agi;
> > -	xfs_dinode_t	*dip;
> > -	xfs_buf_t	*agibp;
> > -	xfs_buf_t	*ibp;
> > -	xfs_agino_t	agino;
> > -	short		bucket_index;
> > -	int		offset;
> > -	int		error;
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	struct xfs_agi		*agi;
> > +	struct xfs_dinode	*dip;
> > +	struct xfs_buf		*agibp;
> > +	struct xfs_buf		*ibp;
> > +	xfs_agnumber_t		agno;
> > +	xfs_agino_t		agino;
> > +	short			bucket_index;
> > +	int			offset;
> > +	int			error;
> >  
> >  	ASSERT(VFS_I(ip)->i_mode != 0);
> > +	ASSERT(xfs_verify_ino(mp, ip->i_ino));
> 
> This verify_ino calls seems odd given that we pass in an initialized
> inode struct..
> 
> > +
> > +	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> > +	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> 
> And then we could move these initializations up int othe lines with
> the variable declarations.
> 
> > +	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> >  
> >  	/*
> >  	 * Get the agi buffer first.  It ensures lock ordering
> >  	 * on the list.
> >  	 */
> 
> If you are touching this anyway, the whole comment could be changed
> into a single line..
> 
> > +	if (!xfs_verify_ino(mp, ip->i_ino))
> > +		return -EFSCORRUPTED;
> >  
> > -	mp = tp->t_mountp;
> >  	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> > +	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> 
> Same here.

Fixed all of these.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/8] xfs: track unlinked inode counts in per-ag data
  2019-02-02 16:14       ` Christoph Hellwig
@ 2019-02-02 19:28         ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 19:28 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Brian Foster, linux-xfs

On Sat, Feb 02, 2019 at 08:14:04AM -0800, Christoph Hellwig wrote:
> On Fri, Feb 01, 2019 at 11:33:48AM -0800, Darrick J. Wong wrote:
> > Maybe we can just drop this patch entirely?  If we leak any unlinked
> > inodes they'll get cleaned up at next mount... wait, we never did merge
> > Eric's patch to reap unlinked inodes at mount time, did we?
> 
> Which patch?  We must recovery unlinked inodes during each mount that
> requires log recovery and have done that since the very beginning.

I /think/ the patch was "xfs: don't require a dirty log on snapshots".

> I don't see how we could ever see unlinked inodes during a clean
> mount.

/me had thought it was related to something about root filesystems that
are mounted ro and then remounted rw, but <shrug> I can't recall ATM.

Anyway, I'm still not sure we need all this mutex+counter infrastructure
for a debugging assert...

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/8] xfs: refactor AGI unlinked bucket updates
  2019-02-01 19:00   ` Brian Foster
@ 2019-02-02 19:50     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 19:50 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 02:00:43PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:09PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Split the AGI unlinked bucket updates into a separate function.  No
> > functional changes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c |   69 ++++++++++++++++++++++++++++++++++++++--------------
> >  fs/xfs/xfs_trace.h |   26 ++++++++++++++++++++
> >  2 files changed, 76 insertions(+), 19 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 98355f5f9253..3c33136b21ef 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1880,6 +1880,46 @@ xfs_inactive(
> >  	xfs_qm_dqdetach(ip);
> >  }
> >  
> > +/*
> > + * Point the AGI unlinked bucket at an inode and log the results.  The caller
> > + * is responsible for validating the old value.
> > + */
> > +STATIC int
> > +xfs_iunlink_update_bucket(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_buf		*agibp,
> > +	unsigned int		bucket_index,
> > +	xfs_agino_t		new_agino)
> > +{
> > +	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
> > +	xfs_agino_t		old_value;
> > +	int			offset;
> > +
> > +	ASSERT(new_agino == NULLAGINO ||
> > +	       xfs_verify_agino(tp->t_mountp, be32_to_cpu(agi->agi_seqno),
> > +				new_agino));
> > +
> > +	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> > +	trace_xfs_iunlink_update_bucket(tp->t_mountp,
> > +			be32_to_cpu(agi->agi_seqno), bucket_index, old_value,
> > +			new_agino);
> > +
> 
> Might as well pass agno as a param, particularly since the callers
> already have it.

Will fix.

> > +	/*
> > +	 * We should never find the head of the list already set to the value
> > +	 * passed in because either we're adding or removing ourselves from the
> > +	 * head of the list.
> > +	 */
> > +	if (old_value == new_agino)
> > +		return -EFSCORRUPTED;
> > +
> 
> This seems a little weird in the NULLAGINO case. That probably should
> still never happen, but doesn't seem indicative of corruption on its
> own. *shrug* not a big deal.

<nod> We shouldn't ever be pulling the last item off an unlinked bucket
only to find that the bucket was empty, but if we do then it's a sign
that something has gone seriously wrong with our locking or something.

--D

> Brian
> 
> > +	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
> > +	offset = offsetof(struct xfs_agi, agi_unlinked) +
> > +			(sizeof(xfs_agino_t) * bucket_index);
> > +	xfs_trans_log_buf(tp, agibp, offset,
> > +			(offset + sizeof(xfs_agino_t) - 1));
> > +	return 0;
> > +}
> > +
> >  /*
> >   * This is called when the inode's link count goes to 0 or we are creating a
> >   * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
> > @@ -1958,15 +1998,10 @@ xfs_iunlink(
> >  		xfs_inobp_check(mp, ibp);
> >  	}
> >  
> > -	/*
> > -	 * Point the bucket head pointer at the inode being inserted.
> > -	 */
> > -	ASSERT(agino != 0);
> > -	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
> > -	offset = offsetof(xfs_agi_t, agi_unlinked) +
> > -		(sizeof(xfs_agino_t) * bucket_index);
> > -	xfs_trans_log_buf(tp, agibp, offset,
> > -			  (offset + sizeof(xfs_agino_t) - 1));
> > +	/* Point the head of the list to point to this inode. */
> > +	error = xfs_iunlink_update_bucket(tp, agibp, bucket_index, agino);
> > +	if (error)
> > +		goto out_unlock;
> >  	pag->pagi_unlinked_count++;
> >  
> >  out_unlock:
> > @@ -2062,16 +2097,12 @@ xfs_iunlink_remove(
> >  		} else {
> >  			xfs_trans_brelse(tp, ibp);
> >  		}
> > -		/*
> > -		 * Point the bucket head pointer at the next inode.
> > -		 */
> > -		ASSERT(next_agino != 0);
> > -		ASSERT(next_agino != agino);
> > -		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
> > -		offset = offsetof(xfs_agi_t, agi_unlinked) +
> > -			(sizeof(xfs_agino_t) * bucket_index);
> > -		xfs_trans_log_buf(tp, agibp, offset,
> > -				  (offset + sizeof(xfs_agino_t) - 1));
> > +
> > +		/* Point the head of the list to the next unlinked inode. */
> > +		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
> > +				next_agino);
> > +		if (error)
> > +			goto out_unlock;
> >  	} else {
> >  		/*
> >  		 * We need to search the list for the inode being freed.
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index 6fcc893dfc91..c10478e7e49a 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3371,6 +3371,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
> >  DEFINE_TRANS_EVENT(xfs_trans_add_item);
> >  DEFINE_TRANS_EVENT(xfs_trans_free_items);
> >  
> > +TRACE_EVENT(xfs_iunlink_update_bucket,
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
> > +		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
> > +	TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(unsigned int, bucket)
> > +		__field(xfs_agino_t, old_ptr)
> > +		__field(xfs_agino_t, new_ptr)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->agno = agno;
> > +		__entry->bucket = bucket;
> > +		__entry->old_ptr = old_ptr;
> > +		__entry->new_ptr = new_ptr;
> > +	),
> > +	TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->agno,
> > +		  __entry->bucket,
> > +		  __entry->old_ptr,
> > +		  __entry->new_ptr)
> > +);
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/8] xfs: refactor AGI unlinked bucket updates
  2019-02-02 16:21   ` Christoph Hellwig
@ 2019-02-02 19:51     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 19:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Sat, Feb 02, 2019 at 08:21:21AM -0800, Christoph Hellwig wrote:
> > +	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
> > +	xfs_agino_t		old_value;
> > +	int			offset;
> > +
> > +	ASSERT(new_agino == NULLAGINO ||
> > +	       xfs_verify_agino(tp->t_mountp, be32_to_cpu(agi->agi_seqno),
> > +				new_agino));
> > +
> > +	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> > +	trace_xfs_iunlink_update_bucket(tp->t_mountp,
> > +			be32_to_cpu(agi->agi_seqno), bucket_index, old_value,
> > +			new_agino);
> 
> As already mentioned by Brian I'd rather pass the agno rather than
> recalculating it twice.
> 
> > +	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
> > +	offset = offsetof(struct xfs_agi, agi_unlinked) +
> > +			(sizeof(xfs_agino_t) * bucket_index);
> > +	xfs_trans_log_buf(tp, agibp, offset,
> > +			(offset + sizeof(xfs_agino_t) - 1));
> 
> No need for the braces here.

Will fix.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 5/8] xfs: refactor inode unlinked pointer update functions
  2019-02-02 16:27   ` Christoph Hellwig
@ 2019-02-02 20:29     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 20:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Sat, Feb 02, 2019 at 08:27:37AM -0800, Christoph Hellwig wrote:
> > +	ASSERT(new_agino == NULLAGINO ||
> > +	       xfs_verify_agino(mp, XFS_INO_TO_AGNO(mp, ino), new_agino));
> 
> We have quite a few uses of this pattern.  Shouldn't we just add a
> a xfs_verify_agino_or_null helper?

Yeah, I'll clean that up.

> Also given that the caller has the agno and we use it a few times
> maybe pass it as an argument?
> 
> Also shouldn't new_agino be called something like next_agino or at
> least have a little comment explaining what it stands for?
> 
> > +	xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1));
> 
> No need for the innter braces.
> 
> Otherwise this looks conceptually fine to me.

Ok, will fix this and the others.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-02-02 16:30   ` Christoph Hellwig
@ 2019-02-02 20:42     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 20:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Sat, Feb 02, 2019 at 08:30:07AM -0800, Christoph Hellwig wrote:
> Nitpick: why hoist here and refactor in the previous patches subject?

Will change.

> > +	xfs_agino_t		target_agino,
> > +	xfs_ino_t		*ino,
> > +	struct xfs_imap		*imap,
> > +	struct xfs_dinode	**dipp,
> > +	struct xfs_buf		**bpp)
> > +{
> > +	struct xfs_imap		last_imap;
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	struct xfs_buf		*last_ibp = NULL;
> > +	struct xfs_dinode	*last_dip;
> > +	xfs_ino_t		next_ino = NULLFSINO;
> > +	xfs_agino_t		next_agino;
> > +	int			error;
> > +
> > +	ASSERT(head_agino != target_agino);
> > +
> > +	next_agino = head_agino;
> > +	while (next_agino != target_agino) {
> > +		xfs_agino_t	unlinked_agino;
> > +
> > +		if (last_ibp)
> > +			xfs_trans_brelse(tp, last_ibp);
> > +
> > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> > +				&last_dip, &last_ibp);
> 
> And reason we stop formatting directly into the passed in imap,
> as the old code did?

General prinicple of not formatting into the caller's variables until
we're sure we're returning 0 but I'll change it to format directly since
it'll save stack space.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-02-01 19:01   ` Brian Foster
@ 2019-02-02 20:46     ` Darrick J. Wong
  2019-02-04 13:18       ` Brian Foster
  0 siblings, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 20:46 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 02:01:38PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:27PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > There's a loop that searches an unlinked bucket list to find the inode
> > that points to a given inode.  Hoist this into a separate function;
> > later we'll use our iunlink backref cache to bypass the slow list
> > operation.  No functional changes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c |  136 ++++++++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 99 insertions(+), 37 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index ea38b66fbc59..d5b3f8fdac7e 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -2084,6 +2084,100 @@ xfs_iunlink(
> >  	return error;
> >  }
> >  
> > +/* Return the imap, dinode pointer, and buffer for an inode. */
> > +STATIC int
> > +xfs_iunlink_map_ino(
> > +	struct xfs_trans	*tp,
> > +	xfs_ino_t		next_ino,
> 
> Might be good to clean up some of these variable names as this code gets
> factored out. E.g., "next_ino" doesn't mean much more than "ino" in the
> context of this helper.

Ok.

> > +	struct xfs_imap		*imap,
> > +	struct xfs_dinode	**dipp,
> > +	struct xfs_buf		**bpp)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	int			error;
> > +
> > +	imap->im_blkno = 0;
> > +	error = xfs_imap(mp, tp, next_ino, imap, 0);
> > +	if (error) {
> > +		xfs_warn(mp, "%s: xfs_imap returned error %d.",
> > +				__func__, error);
> > +		return error;
> > +	}
> > +
> > +	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
> > +	if (error) {
> > +		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> > +				__func__, error);
> > +		return error;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Walk the unlinked chain from @head_agino until we find the inode that
> > + * points to @target_ino.  Return the inode number, map, dinode pointer,
> > + * and inode cluster buffer of that inode as @ino, @imap, @dipp, and @bpp.
> > + *
> > + * Do not call this function if @target_agino is the head of the list.
> > + */
> > +STATIC int
> > +xfs_iunlink_map_prev(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_perag	*pag,
> > +	xfs_agino_t		head_agino,
> > +	xfs_agino_t		target_agino,
> > +	xfs_ino_t		*ino,
> > +	struct xfs_imap		*imap,
> > +	struct xfs_dinode	**dipp,
> > +	struct xfs_buf		**bpp)
> > +{
> 
> It would also be nice to have some oneline comments next to the params
> of some of these functions.

In addition to the description given above the function?

> > +	struct xfs_imap		last_imap;
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	struct xfs_buf		*last_ibp = NULL;
> > +	struct xfs_dinode	*last_dip;
> > +	xfs_ino_t		next_ino = NULLFSINO;
> > +	xfs_agino_t		next_agino;
> > +	int			error;
> > +
> > +	ASSERT(head_agino != target_agino);
> > +
> > +	next_agino = head_agino;
> > +	while (next_agino != target_agino) {
> > +		xfs_agino_t	unlinked_agino;
> > +
> > +		if (last_ibp)
> > +			xfs_trans_brelse(tp, last_ibp);
> > +
> > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> > +				&last_dip, &last_ibp);
> > +		if (error)
> > +			return error;
> > +
> > +		unlinked_agino = be32_to_cpu(last_dip->di_next_unlinked);
> > +		if (!xfs_verify_agino(mp, pag->pag_agno, next_agino) ||
> 
> Should the next_agino above be unlinked_agino? Also is the == check
> below a loop check (comment pls)?

Yes, fixed.

/*
 * Make sure this pointer is valid and isn't an obvious
 * infinite loop.
 */


> > +		    next_agino == unlinked_agino) {
> > +			XFS_CORRUPTION_ERROR(__func__,
> > +					XFS_ERRLEVEL_LOW, mp,
> > +					last_dip, sizeof(*last_dip));
> > +			error = -EFSCORRUPTED;
> > +			return error;
> > +		}
> > +		next_agino = unlinked_agino;
> > +	}
> > +
> > +	/* Should never happen, but don't return garbage. */
> > +	if (next_ino == NULLFSINO)
> > +		return -EFSCORRUPTED;
> > +
> > +	*ino = next_ino;
> > +	memcpy(imap, &last_imap, sizeof(last_imap));
> > +	*dipp = last_dip;
> > +	*bpp = last_ibp;
> > +	return 0;
> > +}
> > +
> >  /*
> >   * Pull the on-disk inode from the AGI unlinked list.
> >   */
> > @@ -2153,43 +2247,11 @@ xfs_iunlink_remove(
> >  	} else {
> >  		struct xfs_imap	imap;
> >  
> > -		/*
> > -		 * We need to search the list for the inode being freed.
> > -		 */
> > -		last_ibp = NULL;
> > -		while (next_agino != agino) {
> > -			if (last_ibp)
> > -				xfs_trans_brelse(tp, last_ibp);
> > -
> > -			imap.im_blkno = 0;
> > -			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
> > -
> > -			error = xfs_imap(mp, tp, next_ino, &imap, 0);
> > -			if (error) {
> > -				xfs_warn(mp,
> > -	"%s: xfs_imap returned error %d.",
> > -					 __func__, error);
> > -				goto out_unlock;
> > -			}
> > -
> > -			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> > -					       &last_ibp, 0, 0);
> > -			if (error) {
> > -				xfs_warn(mp,
> > -	"%s: xfs_imap_to_bp returned error %d.",
> > -					__func__, error);
> > -				goto out_unlock;
> > -			}
> > -
> > -			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
> > -			if (!xfs_verify_agino(mp, agno, next_agino)) {
> > -				XFS_CORRUPTION_ERROR(__func__,
> > -						XFS_ERRLEVEL_LOW, mp,
> > -						last_dip, sizeof(*last_dip));
> > -				error = -EFSCORRUPTED;
> > -				goto out_unlock;
> > -			}
> > -		}
> > +		/* We need to search the list for the inode being freed. */
> > +		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
> > +				&next_ino, &imap, &last_dip, &last_ibp);
> 
> Another variable nit... next_ino is kind of confusing here, particularly
> where we still use next_agino as well. The former is actually the "prev"
> ino output, right?

Yep, will fix.

--D

> Brian
> 
> > +		if (error)
> > +			goto out_unlock;
> >  
> >  		/*
> >  		 * Now last_ibp points to the buffer previous to us on the
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 5/8] xfs: refactor inode unlinked pointer update functions
  2019-02-01 19:01   ` Brian Foster
@ 2019-02-02 22:00     ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-02 22:00 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Feb 01, 2019 at 02:01:17PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:21PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Hoist the functions that update an inode's unlinked pointer updates into
> > a helper.  No functional changes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c |  188 +++++++++++++++++++++++++++-------------------------
> >  fs/xfs/xfs_trace.h |   26 +++++++
> >  2 files changed, 124 insertions(+), 90 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 4ddda3f3255f..ea38b66fbc59 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1920,6 +1920,88 @@ xfs_iunlink_update_bucket(
> >  	return 0;
> >  }
> >  
> > +/* Set an on-disk inode's unlinked pointer and return the old value. */
> 
> Doesn't look like this one returns anything.

Heh, oops, will fix.

> > +STATIC void
> > +xfs_iunlink_update_dinode(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_buf		*ibp,
> > +	struct xfs_dinode	*dip,
> > +	struct xfs_imap		*imap,
> > +	xfs_ino_t		ino,
> > +	xfs_agino_t		new_agino)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	int			offset;
> > +
> > +	ASSERT(new_agino == NULLAGINO ||
> > +	       xfs_verify_agino(mp, XFS_INO_TO_AGNO(mp, ino), new_agino));
> > +
> > +	trace_xfs_iunlink_update_dinode(mp,
> > +			XFS_INO_TO_AGNO(mp, ino),
> > +			XFS_INO_TO_AGINO(mp, ino),
> > +			be32_to_cpu(dip->di_next_unlinked), new_agino);
> > +
> > +	dip->di_next_unlinked = cpu_to_be32(new_agino);
> > +	offset = imap->im_boffset +
> > +			offsetof(struct xfs_dinode, di_next_unlinked);
> > +
> > +	/* need to recalc the inode CRC if appropriate */
> > +	xfs_dinode_calc_crc(mp, dip);
> > +	xfs_trans_inode_buf(tp, ibp);
> > +	xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1));
> > +	xfs_inobp_check(mp, ibp);
> > +}
> > +
> > +/* Set an in-core inode's unlinked pointer and return the old value. */
> > +STATIC int
> > +xfs_iunlink_update_inode(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_inode	*ip,
> > +	xfs_agino_t		new_agino,
> > +	xfs_agino_t		*old_agino)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	struct xfs_dinode	*dip;
> > +	struct xfs_buf		*ibp;
> > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> > +	xfs_agino_t		old_value;
> > +	int			error;
> > +
> > +	ASSERT(new_agino == NULLAGINO || xfs_verify_agino(mp, agno, new_agino));
> > +
> > +	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Make sure the old pointer isn't garbage. */
> > +	old_value = be32_to_cpu(dip->di_next_unlinked);
> > +	if (old_value != NULLAGINO && !xfs_verify_agino(mp, agno, old_value)) {
> > +		error = -EFSCORRUPTED;
> > +		goto out;
> > +	}
> > +	*old_agino = old_value;
> > +
> > +	/*
> > +	 * Since we're updating a linked list, we should never find that the
> > +	 * current pointer is the same as the new value, unless we're
> > +	 * terminating the list.
> > +	 */
> > +	*old_agino = old_value;
> 
> Double assign of *old_agino.

Will fix.

> > +	if (old_value == new_agino) {
> > +		if (new_agino != NULLAGINO)
> > +			error = -EFSCORRUPTED;
> > +		goto out;
> > +	}
> > +
> > +	/* Ok, update the new pointer. */
> > +	xfs_iunlink_update_dinode(tp, ibp, dip, &ip->i_imap, ip->i_ino,
> > +			new_agino);
> > +	return 0;
> > +out:
> > +	xfs_trans_brelse(tp, ibp);
> > +	return error;
> > +}
> > +
> >  /*
> >   * This is called when the inode's link count goes to 0 or we are creating a
> >   * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
> > @@ -1937,15 +2019,12 @@ xfs_iunlink(
> >  {
> >  	struct xfs_mount	*mp = tp->t_mountp;
> >  	struct xfs_agi		*agi;
> > -	struct xfs_dinode	*dip;
> >  	struct xfs_buf		*agibp;
> > -	struct xfs_buf		*ibp;
> >  	struct xfs_perag	*pag;
> >  	xfs_agnumber_t		agno;
> >  	xfs_agino_t		next_agino;
> >  	xfs_agino_t		agino;
> >  	short			bucket_index;
> > -	int			offset;
> >  	int			error;
> >  
> >  	ASSERT(VFS_I(ip)->i_mode != 0);
> > @@ -1980,29 +2059,17 @@ xfs_iunlink(
> >  	}
> >  
> >  	if (next_agino != NULLAGINO) {
> > +		xfs_agino_t	old_agino;
> > +
> >  		/*
> >  		 * There is already another inode in the bucket we need
> >  		 * to add ourselves to.  Add us at the front of the list.
> > -		 * Here we put the head pointer into our next pointer,
> > -		 * and then we fall through to point the head at us.
> >  		 */
> > -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> > -				       0, 0);
> > +		error = xfs_iunlink_update_inode(tp, ip, next_agino,
> > +				&old_agino);
> 
> What's left of the comment above seems a bit misleading wrt to what
> we're doing here. Could we say something like "point this inode to the
> current head of the list" instead of "add us to the front of the list?"

Ok.

> >  		if (error)
> >  			goto out_unlock;
> > -
> > -		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
> > -		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
> > -		offset = ip->i_imap.im_boffset +
> > -			offsetof(xfs_dinode_t, di_next_unlinked);
> > -
> > -		/* need to recalc the inode CRC if appropriate */
> > -		xfs_dinode_calc_crc(mp, dip);
> > -
> > -		xfs_trans_inode_buf(tp, ibp);
> > -		xfs_trans_log_buf(tp, ibp, offset,
> > -				  (offset + sizeof(xfs_agino_t) - 1));
> > -		xfs_inobp_check(mp, ibp);
> > +		ASSERT(old_agino == NULLAGINO);
> >  	}
> >  
> >  	/* Point the head of the list to point to this inode. */
> > @@ -2027,9 +2094,7 @@ xfs_iunlink_remove(
> >  {
> >  	struct xfs_mount	*mp = tp->t_mountp;
> >  	struct xfs_agi		*agi;
> > -	struct xfs_dinode	*dip;
> >  	struct xfs_buf		*agibp;
> > -	struct xfs_buf		*ibp;
> >  	struct xfs_buf		*last_ibp;
> >  	struct xfs_dinode	*last_dip = NULL;
> >  	struct xfs_perag	*pag;
> > @@ -2038,8 +2103,6 @@ xfs_iunlink_remove(
> >  	xfs_agino_t		agino;
> >  	xfs_agino_t		next_agino;
> >  	short			bucket_index;
> > -	int			offset;
> > -	int			last_offset = 0;
> >  	int			error;
> >  
> >  	if (!xfs_verify_ino(mp, ip->i_ino))
> > @@ -2076,34 +2139,11 @@ xfs_iunlink_remove(
> >  		/*
> >  		 * We're at the head of the list.  Get the inode's on-disk
> >  		 * buffer to see if there is anyone after us on the list.
> > -		 * Only modify our next pointer if it is not already NULLAGINO.
> > -		 * This saves us the overhead of dealing with the buffer when
> > -		 * there is no need to change it.
> >  		 */
> > -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> > -				       0, 0);
> > -		if (error) {
> > -			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> > -				__func__, error);
> > +		error = xfs_iunlink_update_inode(tp, ip, NULLAGINO,
> > +				&next_agino);
> > +		if (error)
> >  			goto out_unlock;
> > -		}
> > -		next_agino = be32_to_cpu(dip->di_next_unlinked);
> > -		ASSERT(next_agino != 0);
> > -		if (next_agino != NULLAGINO) {
> > -			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
> > -			offset = ip->i_imap.im_boffset +
> > -				offsetof(xfs_dinode_t, di_next_unlinked);
> > -
> > -			/* need to recalc the inode CRC if appropriate */
> > -			xfs_dinode_calc_crc(mp, dip);
> > -
> > -			xfs_trans_inode_buf(tp, ibp);
> > -			xfs_trans_log_buf(tp, ibp, offset,
> > -					  (offset + sizeof(xfs_agino_t) - 1));
> > -			xfs_inobp_check(mp, ibp);
> > -		} else {
> > -			xfs_trans_brelse(tp, ibp);
> > -		}
> >  
> >  		/* Point the head of the list to the next unlinked inode. */
> >  		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
> > @@ -2111,13 +2151,13 @@ xfs_iunlink_remove(
> >  		if (error)
> >  			goto out_unlock;
> >  	} else {
> > +		struct xfs_imap	imap;
> > +
> >  		/*
> >  		 * We need to search the list for the inode being freed.
> >  		 */
> >  		last_ibp = NULL;
> >  		while (next_agino != agino) {
> > -			struct xfs_imap	imap;
> > -
> >  			if (last_ibp)
> >  				xfs_trans_brelse(tp, last_ibp);
> >  
> > @@ -2141,7 +2181,6 @@ xfs_iunlink_remove(
> >  				goto out_unlock;
> >  			}
> >  
> > -			last_offset = imap.im_boffset;
> >  			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
> >  			if (!xfs_verify_agino(mp, agno, next_agino)) {
> >  				XFS_CORRUPTION_ERROR(__func__,
> > @@ -2156,45 +2195,14 @@ xfs_iunlink_remove(
> >  		 * Now last_ibp points to the buffer previous to us on the
> >  		 * unlinked list.  Pull us from the list.
> >  		 */
> > -		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
> > -				       0, 0);
> > -		if (error) {
> > -			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
> > -				__func__, error);
> > +		error = xfs_iunlink_update_inode(tp, ip, NULLAGINO,
> > +				&next_agino);
> > +		if (error)
> >  			goto out_unlock;
> 
> It looks like the above _update_inode() call is now common across both
> branches. We might be able to factor that out a bit further so the
> if/else just determines whether we update the bucket or a previous
> dinode.

I'll do that, though I think I'll do that as a separate patch.

--D

> Brian
> 
> > -		}
> > -		next_agino = be32_to_cpu(dip->di_next_unlinked);
> > -		ASSERT(next_agino != 0);
> > -		ASSERT(next_agino != agino);
> > -		if (next_agino != NULLAGINO) {
> > -			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
> > -			offset = ip->i_imap.im_boffset +
> > -				offsetof(xfs_dinode_t, di_next_unlinked);
> > -
> > -			/* need to recalc the inode CRC if appropriate */
> > -			xfs_dinode_calc_crc(mp, dip);
> > -
> > -			xfs_trans_inode_buf(tp, ibp);
> > -			xfs_trans_log_buf(tp, ibp, offset,
> > -					  (offset + sizeof(xfs_agino_t) - 1));
> > -			xfs_inobp_check(mp, ibp);
> > -		} else {
> > -			xfs_trans_brelse(tp, ibp);
> > -		}
> > -		/*
> > -		 * Point the previous inode on the list to the next inode.
> > -		 */
> > -		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
> > -		ASSERT(next_agino != 0);
> > -		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
> > -
> > -		/* need to recalc the inode CRC if appropriate */
> > -		xfs_dinode_calc_crc(mp, last_dip);
> >  
> > -		xfs_trans_inode_buf(tp, last_ibp);
> > -		xfs_trans_log_buf(tp, last_ibp, offset,
> > -				  (offset + sizeof(xfs_agino_t) - 1));
> > -		xfs_inobp_check(mp, last_ibp);
> > +		/* Point the previous inode on the list to the next inode. */
> > +		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
> > +				next_ino, next_agino);
> >  	}
> >  	pag->pagi_unlinked_count--;
> >  
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index c10478e7e49a..fbec8f0e1a9a 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3397,6 +3397,32 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
> >  		  __entry->new_ptr)
> >  );
> >  
> > +TRACE_EVENT(xfs_iunlink_update_dinode,
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
> > +		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
> > +	TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(xfs_agino_t, agino)
> > +		__field(xfs_agino_t, old_ptr)
> > +		__field(xfs_agino_t, new_ptr)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->agno = agno;
> > +		__entry->agino = agino;
> > +		__entry->old_ptr = old_ptr;
> > +		__entry->new_ptr = new_ptr;
> > +	),
> > +	TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->agno,
> > +		  __entry->agino,
> > +		  __entry->old_ptr,
> > +		  __entry->new_ptr)
> > +);
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-02-02 20:46     ` Darrick J. Wong
@ 2019-02-04 13:18       ` Brian Foster
  2019-02-04 16:31         ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Brian Foster @ 2019-02-04 13:18 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Sat, Feb 02, 2019 at 12:46:00PM -0800, Darrick J. Wong wrote:
> On Fri, Feb 01, 2019 at 02:01:38PM -0500, Brian Foster wrote:
> > On Thu, Jan 31, 2019 at 03:18:27PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > There's a loop that searches an unlinked bucket list to find the inode
> > > that points to a given inode.  Hoist this into a separate function;
> > > later we'll use our iunlink backref cache to bypass the slow list
> > > operation.  No functional changes.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/xfs_inode.c |  136 ++++++++++++++++++++++++++++++++++++++--------------
> > >  1 file changed, 99 insertions(+), 37 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index ea38b66fbc59..d5b3f8fdac7e 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -2084,6 +2084,100 @@ xfs_iunlink(
> > >  	return error;
> > >  }
> > >  
> > > +/* Return the imap, dinode pointer, and buffer for an inode. */
> > > +STATIC int
> > > +xfs_iunlink_map_ino(
> > > +	struct xfs_trans	*tp,
> > > +	xfs_ino_t		next_ino,
> > 
> > Might be good to clean up some of these variable names as this code gets
> > factored out. E.g., "next_ino" doesn't mean much more than "ino" in the
> > context of this helper.
> 
> Ok.
> 
> > > +	struct xfs_imap		*imap,
> > > +	struct xfs_dinode	**dipp,
> > > +	struct xfs_buf		**bpp)
> > > +{
> > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > +	int			error;
> > > +
> > > +	imap->im_blkno = 0;
> > > +	error = xfs_imap(mp, tp, next_ino, imap, 0);
> > > +	if (error) {
> > > +		xfs_warn(mp, "%s: xfs_imap returned error %d.",
> > > +				__func__, error);
> > > +		return error;
> > > +	}
> > > +
> > > +	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
> > > +	if (error) {
> > > +		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> > > +				__func__, error);
> > > +		return error;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Walk the unlinked chain from @head_agino until we find the inode that
> > > + * points to @target_ino.  Return the inode number, map, dinode pointer,
> > > + * and inode cluster buffer of that inode as @ino, @imap, @dipp, and @bpp.
> > > + *
> > > + * Do not call this function if @target_agino is the head of the list.
> > > + */
> > > +STATIC int
> > > +xfs_iunlink_map_prev(
> > > +	struct xfs_trans	*tp,
> > > +	struct xfs_perag	*pag,
> > > +	xfs_agino_t		head_agino,
> > > +	xfs_agino_t		target_agino,
> > > +	xfs_ino_t		*ino,
> > > +	struct xfs_imap		*imap,
> > > +	struct xfs_dinode	**dipp,
> > > +	struct xfs_buf		**bpp)
> > > +{
> > 
> > It would also be nice to have some oneline comments next to the params
> > of some of these functions.
> 
> In addition to the description given above the function?
> 

I missed the header comment.. perhaps just a one liner that indicates
ino and everything after are output params would be helpful here. BTW,
s/@target_ino/@target_agino/ in the header comment?

Brian

> > > +	struct xfs_imap		last_imap;
> > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > +	struct xfs_buf		*last_ibp = NULL;
> > > +	struct xfs_dinode	*last_dip;
> > > +	xfs_ino_t		next_ino = NULLFSINO;
> > > +	xfs_agino_t		next_agino;
> > > +	int			error;
> > > +
> > > +	ASSERT(head_agino != target_agino);
> > > +
> > > +	next_agino = head_agino;
> > > +	while (next_agino != target_agino) {
> > > +		xfs_agino_t	unlinked_agino;
> > > +
> > > +		if (last_ibp)
> > > +			xfs_trans_brelse(tp, last_ibp);
> > > +
> > > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > > +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> > > +				&last_dip, &last_ibp);
> > > +		if (error)
> > > +			return error;
> > > +
> > > +		unlinked_agino = be32_to_cpu(last_dip->di_next_unlinked);
> > > +		if (!xfs_verify_agino(mp, pag->pag_agno, next_agino) ||
> > 
> > Should the next_agino above be unlinked_agino? Also is the == check
> > below a loop check (comment pls)?
> 
> Yes, fixed.
> 
> /*
>  * Make sure this pointer is valid and isn't an obvious
>  * infinite loop.
>  */
> 
> 
> > > +		    next_agino == unlinked_agino) {
> > > +			XFS_CORRUPTION_ERROR(__func__,
> > > +					XFS_ERRLEVEL_LOW, mp,
> > > +					last_dip, sizeof(*last_dip));
> > > +			error = -EFSCORRUPTED;
> > > +			return error;
> > > +		}
> > > +		next_agino = unlinked_agino;
> > > +	}
> > > +
> > > +	/* Should never happen, but don't return garbage. */
> > > +	if (next_ino == NULLFSINO)
> > > +		return -EFSCORRUPTED;
> > > +
> > > +	*ino = next_ino;
> > > +	memcpy(imap, &last_imap, sizeof(last_imap));
> > > +	*dipp = last_dip;
> > > +	*bpp = last_ibp;
> > > +	return 0;
> > > +}
> > > +
> > >  /*
> > >   * Pull the on-disk inode from the AGI unlinked list.
> > >   */
> > > @@ -2153,43 +2247,11 @@ xfs_iunlink_remove(
> > >  	} else {
> > >  		struct xfs_imap	imap;
> > >  
> > > -		/*
> > > -		 * We need to search the list for the inode being freed.
> > > -		 */
> > > -		last_ibp = NULL;
> > > -		while (next_agino != agino) {
> > > -			if (last_ibp)
> > > -				xfs_trans_brelse(tp, last_ibp);
> > > -
> > > -			imap.im_blkno = 0;
> > > -			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
> > > -
> > > -			error = xfs_imap(mp, tp, next_ino, &imap, 0);
> > > -			if (error) {
> > > -				xfs_warn(mp,
> > > -	"%s: xfs_imap returned error %d.",
> > > -					 __func__, error);
> > > -				goto out_unlock;
> > > -			}
> > > -
> > > -			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> > > -					       &last_ibp, 0, 0);
> > > -			if (error) {
> > > -				xfs_warn(mp,
> > > -	"%s: xfs_imap_to_bp returned error %d.",
> > > -					__func__, error);
> > > -				goto out_unlock;
> > > -			}
> > > -
> > > -			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
> > > -			if (!xfs_verify_agino(mp, agno, next_agino)) {
> > > -				XFS_CORRUPTION_ERROR(__func__,
> > > -						XFS_ERRLEVEL_LOW, mp,
> > > -						last_dip, sizeof(*last_dip));
> > > -				error = -EFSCORRUPTED;
> > > -				goto out_unlock;
> > > -			}
> > > -		}
> > > +		/* We need to search the list for the inode being freed. */
> > > +		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
> > > +				&next_ino, &imap, &last_dip, &last_ibp);
> > 
> > Another variable nit... next_ino is kind of confusing here, particularly
> > where we still use next_agino as well. The former is actually the "prev"
> > ino output, right?
> 
> Yep, will fix.
> 
> --D
> 
> > Brian
> > 
> > > +		if (error)
> > > +			goto out_unlock;
> > >  
> > >  		/*
> > >  		 * Now last_ibp points to the buffer previous to us on the
> > > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function
  2019-02-04 13:18       ` Brian Foster
@ 2019-02-04 16:31         ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2019-02-04 16:31 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Mon, Feb 04, 2019 at 08:18:45AM -0500, Brian Foster wrote:
> On Sat, Feb 02, 2019 at 12:46:00PM -0800, Darrick J. Wong wrote:
> > On Fri, Feb 01, 2019 at 02:01:38PM -0500, Brian Foster wrote:
> > > On Thu, Jan 31, 2019 at 03:18:27PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > There's a loop that searches an unlinked bucket list to find the inode
> > > > that points to a given inode.  Hoist this into a separate function;
> > > > later we'll use our iunlink backref cache to bypass the slow list
> > > > operation.  No functional changes.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/xfs/xfs_inode.c |  136 ++++++++++++++++++++++++++++++++++++++--------------
> > > >  1 file changed, 99 insertions(+), 37 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > index ea38b66fbc59..d5b3f8fdac7e 100644
> > > > --- a/fs/xfs/xfs_inode.c
> > > > +++ b/fs/xfs/xfs_inode.c
> > > > @@ -2084,6 +2084,100 @@ xfs_iunlink(
> > > >  	return error;
> > > >  }
> > > >  
> > > > +/* Return the imap, dinode pointer, and buffer for an inode. */
> > > > +STATIC int
> > > > +xfs_iunlink_map_ino(
> > > > +	struct xfs_trans	*tp,
> > > > +	xfs_ino_t		next_ino,
> > > 
> > > Might be good to clean up some of these variable names as this code gets
> > > factored out. E.g., "next_ino" doesn't mean much more than "ino" in the
> > > context of this helper.
> > 
> > Ok.
> > 
> > > > +	struct xfs_imap		*imap,
> > > > +	struct xfs_dinode	**dipp,
> > > > +	struct xfs_buf		**bpp)
> > > > +{
> > > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > > +	int			error;
> > > > +
> > > > +	imap->im_blkno = 0;
> > > > +	error = xfs_imap(mp, tp, next_ino, imap, 0);
> > > > +	if (error) {
> > > > +		xfs_warn(mp, "%s: xfs_imap returned error %d.",
> > > > +				__func__, error);
> > > > +		return error;
> > > > +	}
> > > > +
> > > > +	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
> > > > +	if (error) {
> > > > +		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
> > > > +				__func__, error);
> > > > +		return error;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Walk the unlinked chain from @head_agino until we find the inode that
> > > > + * points to @target_ino.  Return the inode number, map, dinode pointer,
> > > > + * and inode cluster buffer of that inode as @ino, @imap, @dipp, and @bpp.
> > > > + *
> > > > + * Do not call this function if @target_agino is the head of the list.
> > > > + */
> > > > +STATIC int
> > > > +xfs_iunlink_map_prev(
> > > > +	struct xfs_trans	*tp,
> > > > +	struct xfs_perag	*pag,
> > > > +	xfs_agino_t		head_agino,
> > > > +	xfs_agino_t		target_agino,
> > > > +	xfs_ino_t		*ino,
> > > > +	struct xfs_imap		*imap,
> > > > +	struct xfs_dinode	**dipp,
> > > > +	struct xfs_buf		**bpp)
> > > > +{
> > > 
> > > It would also be nice to have some oneline comments next to the params
> > > of some of these functions.
> > 
> > In addition to the description given above the function?
> > 
> 
> I missed the header comment.. perhaps just a one liner that indicates
> ino and everything after are output params would be helpful here. BTW,
> s/@target_ino/@target_agino/ in the header comment?

Ok, will fix.  Thanks for the review!

--D

> Brian
> 
> > > > +	struct xfs_imap		last_imap;
> > > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > > +	struct xfs_buf		*last_ibp = NULL;
> > > > +	struct xfs_dinode	*last_dip;
> > > > +	xfs_ino_t		next_ino = NULLFSINO;
> > > > +	xfs_agino_t		next_agino;
> > > > +	int			error;
> > > > +
> > > > +	ASSERT(head_agino != target_agino);
> > > > +
> > > > +	next_agino = head_agino;
> > > > +	while (next_agino != target_agino) {
> > > > +		xfs_agino_t	unlinked_agino;
> > > > +
> > > > +		if (last_ibp)
> > > > +			xfs_trans_brelse(tp, last_ibp);
> > > > +
> > > > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > > > +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> > > > +				&last_dip, &last_ibp);
> > > > +		if (error)
> > > > +			return error;
> > > > +
> > > > +		unlinked_agino = be32_to_cpu(last_dip->di_next_unlinked);
> > > > +		if (!xfs_verify_agino(mp, pag->pag_agno, next_agino) ||
> > > 
> > > Should the next_agino above be unlinked_agino? Also is the == check
> > > below a loop check (comment pls)?
> > 
> > Yes, fixed.
> > 
> > /*
> >  * Make sure this pointer is valid and isn't an obvious
> >  * infinite loop.
> >  */
> > 
> > 
> > > > +		    next_agino == unlinked_agino) {
> > > > +			XFS_CORRUPTION_ERROR(__func__,
> > > > +					XFS_ERRLEVEL_LOW, mp,
> > > > +					last_dip, sizeof(*last_dip));
> > > > +			error = -EFSCORRUPTED;
> > > > +			return error;
> > > > +		}
> > > > +		next_agino = unlinked_agino;
> > > > +	}
> > > > +
> > > > +	/* Should never happen, but don't return garbage. */
> > > > +	if (next_ino == NULLFSINO)
> > > > +		return -EFSCORRUPTED;
> > > > +
> > > > +	*ino = next_ino;
> > > > +	memcpy(imap, &last_imap, sizeof(last_imap));
> > > > +	*dipp = last_dip;
> > > > +	*bpp = last_ibp;
> > > > +	return 0;
> > > > +}
> > > > +
> > > >  /*
> > > >   * Pull the on-disk inode from the AGI unlinked list.
> > > >   */
> > > > @@ -2153,43 +2247,11 @@ xfs_iunlink_remove(
> > > >  	} else {
> > > >  		struct xfs_imap	imap;
> > > >  
> > > > -		/*
> > > > -		 * We need to search the list for the inode being freed.
> > > > -		 */
> > > > -		last_ibp = NULL;
> > > > -		while (next_agino != agino) {
> > > > -			if (last_ibp)
> > > > -				xfs_trans_brelse(tp, last_ibp);
> > > > -
> > > > -			imap.im_blkno = 0;
> > > > -			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
> > > > -
> > > > -			error = xfs_imap(mp, tp, next_ino, &imap, 0);
> > > > -			if (error) {
> > > > -				xfs_warn(mp,
> > > > -	"%s: xfs_imap returned error %d.",
> > > > -					 __func__, error);
> > > > -				goto out_unlock;
> > > > -			}
> > > > -
> > > > -			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
> > > > -					       &last_ibp, 0, 0);
> > > > -			if (error) {
> > > > -				xfs_warn(mp,
> > > > -	"%s: xfs_imap_to_bp returned error %d.",
> > > > -					__func__, error);
> > > > -				goto out_unlock;
> > > > -			}
> > > > -
> > > > -			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
> > > > -			if (!xfs_verify_agino(mp, agno, next_agino)) {
> > > > -				XFS_CORRUPTION_ERROR(__func__,
> > > > -						XFS_ERRLEVEL_LOW, mp,
> > > > -						last_dip, sizeof(*last_dip));
> > > > -				error = -EFSCORRUPTED;
> > > > -				goto out_unlock;
> > > > -			}
> > > > -		}
> > > > +		/* We need to search the list for the inode being freed. */
> > > > +		error = xfs_iunlink_map_prev(tp, pag, next_agino, agino,
> > > > +				&next_ino, &imap, &last_dip, &last_ibp);
> > > 
> > > Another variable nit... next_ino is kind of confusing here, particularly
> > > where we still use next_agino as well. The former is actually the "prev"
> > > ino output, right?
> > 
> > Yep, will fix.
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > +		if (error)
> > > > +			goto out_unlock;
> > > >  
> > > >  		/*
> > > >  		 * Now last_ibp points to the buffer previous to us on the
> > > > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2019-02-04 16:32 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-31 23:17 [PATCH 0/8] xfs: incore unlinked list Darrick J. Wong
2019-01-31 23:17 ` [PATCH 1/8] xfs: clean up iunlink functions Darrick J. Wong
2019-02-01  8:01   ` Christoph Hellwig
2019-02-02 19:15     ` Darrick J. Wong
2019-01-31 23:18 ` [PATCH 2/8] xfs: track unlinked inode counts in per-ag data Darrick J. Wong
2019-02-01 18:59   ` Brian Foster
2019-02-01 19:33     ` Darrick J. Wong
2019-02-02 16:14       ` Christoph Hellwig
2019-02-02 19:28         ` Darrick J. Wong
2019-01-31 23:18 ` [PATCH 3/8] xfs: refactor AGI unlinked bucket updates Darrick J. Wong
2019-02-01 19:00   ` Brian Foster
2019-02-02 19:50     ` Darrick J. Wong
2019-02-02 16:21   ` Christoph Hellwig
2019-02-02 19:51     ` Darrick J. Wong
2019-01-31 23:18 ` [PATCH 4/8] xfs: strengthen AGI unlinked inode bucket pointer checks Darrick J. Wong
2019-02-01 19:00   ` Brian Foster
2019-02-02 16:22   ` Christoph Hellwig
2019-01-31 23:18 ` [PATCH 5/8] xfs: refactor inode unlinked pointer update functions Darrick J. Wong
2019-02-01 19:01   ` Brian Foster
2019-02-02 22:00     ` Darrick J. Wong
2019-02-02 16:27   ` Christoph Hellwig
2019-02-02 20:29     ` Darrick J. Wong
2019-01-31 23:18 ` [PATCH 6/8] xfs: hoist unlinked list search and mapping to a separate function Darrick J. Wong
2019-02-01 19:01   ` Brian Foster
2019-02-02 20:46     ` Darrick J. Wong
2019-02-04 13:18       ` Brian Foster
2019-02-04 16:31         ` Darrick J. Wong
2019-02-02 16:30   ` Christoph Hellwig
2019-02-02 20:42     ` Darrick J. Wong
2019-02-02 16:51   ` Christoph Hellwig
2019-01-31 23:18 ` [PATCH 7/8] xfs: add tracepoints for high level iunlink operations Darrick J. Wong
2019-02-01 19:01   ` Brian Foster
2019-02-01 19:14     ` Darrick J. Wong
2019-01-31 23:18 ` [PATCH 8/8] xfs: cache unlinked pointers in an rhashtable Darrick J. Wong
2019-02-01  8:03   ` Christoph Hellwig
2019-02-01 23:59     ` Dave Chinner
2019-02-02  4:31       ` Darrick J. Wong
2019-02-02 16:07         ` Christoph Hellwig
2019-02-01 19:29   ` Brian Foster
2019-02-01 19:40     ` Darrick J. Wong
2019-02-02 17:01   ` Christoph Hellwig
2019-02-01  7:57 ` [PATCH 0/8] xfs: incore unlinked list Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.