Linux-XFS Archive on lore.kernel.org
 help / color / Atom feed
From: Gao Xiang <hsiangkao@redhat.com>
To: linux-xfs@vger.kernel.org
Cc: Dave Chinner <david@fromorbit.com>,
	"Darrick J. Wong" <darrick.wong@oracle.com>,
	Brian Foster <bfoster@redhat.com>,
	Gao Xiang <hsiangkao@redhat.com>
Subject: [RFC PATCH 2/2] xfs: don't access AGI on unlinked inodes if it can
Date: Tue,  7 Jul 2020 21:57:41 +0800
Message-ID: <20200707135741.487-3-hsiangkao@redhat.com> (raw)
In-Reply-To: <20200707135741.487-1-hsiangkao@redhat.com>

Currently, we use AGI buffer lock to protect in-memory linked list for
unlinked inodes but since it's not necessary to modify AGI unless the
head of the unlinked list is modified. So let's removing the AGI buffer
modification dependency if possible, including 1) adding another per-AG
dedicated lock to protect the whole list and 2) inserting unlinked
inodes from tail.

For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink()
to use. xfs_iunlink_remove() still support old multiple short bucket
lists for recovery code.

Note that some paths take AGI lock in its transaction in advance,
so the proper locking order is only AGI lock -> unlinked list lock.

Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
 fs/xfs/xfs_inode.c       | 251 ++++++++++++++++++++-------------------
 fs/xfs/xfs_log_recover.c |   6 +
 fs/xfs/xfs_mount.c       |   3 +
 fs/xfs/xfs_mount.h       |   3 +
 4 files changed, 144 insertions(+), 119 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 10565fa5ace4..d33e5b198534 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1994,182 +1994,195 @@ xfs_iunlink_update_bucket(
 }
 
 /*
- * Always insert at the head, so we only have to do a next inode lookup to
- * update it's prev pointer. The AGI bucket will point at the one we are
- * inserting.
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI.  It will be pulled from this
+ * list when the inode is freed.
  */
-static int
-xfs_iunlink_insert_inode(
+STATIC int
+xfs_iunlink(
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agibp,
 	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	struct xfs_perag	*pag;
+	struct xfs_inode	*pip;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		next_agino;
+	int			error;
+
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
+	trace_xfs_iunlink(ip);
+
+	pag = xfs_perag_get(mp, agno);
 
 	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the pointer isn't garbage and that this inode
-	 * isn't already on the list.
+	 * some paths (e.g. xfs_create_tmpfile) could take AGI lock
+	 * in this transaction in advance and the only locking order
+	 * AGI buf lock -> pag_unlinked_mutex is safe.
 	 */
-	next_agino = be32_to_cpu(agi->agi_unlinked[0]);
-	if (next_agino == agino ||
-	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
-		xfs_buf_mark_corrupt(agibp);
-		return -EFSCORRUPTED;
-	}
-
-	ip->i_prev_unlinked = NULLAGINO;
-	ip->i_next_unlinked = next_agino;
-	if (ip->i_next_unlinked != NULLAGINO) {
-		struct xfs_inode	*nip;
+	mutex_lock(&pag->pag_unlinked_mutex);
+	pip = pag->pag_unlinked_tail;
+	if (!pip) {
+		struct xfs_buf	*agibp;
 
-		nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
-		if (IS_ERR_OR_NULL(nip))
-			return -EFSCORRUPTED;
+		mutex_unlock(&pag->pag_unlinked_mutex);
 
-		if (nip->i_prev_unlinked != NULLAGINO) {
-			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
-						NULL, 0, __this_address);
-			return -EFSCORRUPTED;
+		/*
+		 * there could be some race, but it doesn't matter though
+		 * since !pip doesn't happen frequently.
+		 */
+		error = xfs_read_agi(mp, tp, agno, &agibp);
+		if (error) {
+			xfs_perag_put(pag);
+			return error;
 		}
-		nip->i_prev_unlinked = agino;
 
-		/* update the on disk inode now */
-		xfs_iunlink_log(tp, ip);
-	}
+		mutex_lock(&pag->pag_unlinked_mutex);
+		pip = pag->pag_unlinked_tail;
+		if (!pip) {
+			struct xfs_agi	*agi;
+
+			ip->i_prev_unlinked = NULLAGINO;
 
-	/* Point the head of the list to point to this inode. */
-	return xfs_iunlink_update_bucket(tp, agno, agibp, next_agino, agino);
+			agi = agibp->b_addr;
+			ASSERT(be32_to_cpu(agi->agi_unlinked[0]) == NULLAGINO);
+
+			/* Point the head of the list to point to this inode. */
+			error = xfs_iunlink_update_bucket(tp, agno,
+					agibp, NULLAGINO, agino);
+			goto out;
+		}
+	}
+	ip->i_prev_unlinked = XFS_INO_TO_AGINO(mp, pip->i_ino);
+	ASSERT(pip->i_next_unlinked == NULLAGINO);
+	pip->i_next_unlinked = agino;
+	xfs_iunlink_log(tp, pip);
+out:
+	ip->i_next_unlinked = NULLAGINO;
+	pag->pag_unlinked_tail = ip;
+	mutex_unlock(&pag->pag_unlinked_mutex);
+	xfs_perag_put(pag);
+	return 0;
 }
 
 /*
+ * Pull the on-disk inode from the AGI unlinked list.
+ *
  * Remove can be from anywhere in the list, so we have to do two adjacent inode
  * lookups here so we can update list pointers. We may be at the head or the
  * tail of the list, so we have to handle those cases as well.
  */
-static int
-xfs_iunlink_remove_inode(
+STATIC int
+xfs_iunlink_remove(
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agibp,
 	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	struct xfs_perag	*pag;
+	struct xfs_inode	*pip = NULL;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		next_agino = ip->i_next_unlinked;
+	xfs_agino_t		next_agino;
 	int			error;
 
+	trace_xfs_iunlink_remove(ip);
+
+	pag = xfs_perag_get(mp, agno);
+	mutex_lock(&pag->pag_unlinked_mutex);
+
+	/* see the comment in xfs_iunlink() on the only proper locking order */
 	if (ip->i_prev_unlinked == NULLAGINO) {
-		/* remove from head of list */
-		if (next_agino == agino ||
-		    !xfs_verify_agino_or_null(mp, agno, next_agino))
-			return -EFSCORRUPTED;
+		struct xfs_buf	*agibp;
 
-		error = xfs_iunlink_update_bucket(tp, agno, agibp, agino, next_agino);
-		if (error)
+		mutex_unlock(&pag->pag_unlinked_mutex);
+
+		error = xfs_read_agi(mp, tp, agno, &agibp);
+		if (error) {
+			xfs_perag_put(pag);
 			return error;
-	} else {
-		/* lookup previous inode and update to point at next */
-		struct xfs_inode	*pip;
+		}
 
-		pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip);
-		if (IS_ERR_OR_NULL(pip))
-			return -EFSCORRUPTED;
+		mutex_lock(&pag->pag_unlinked_mutex);
+		if (ip->i_prev_unlinked == NULLAGINO) {
+			struct xfs_agi	*agi;
 
-		if (pip->i_next_unlinked != agino) {
-			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
-						NULL, 0, __this_address);
-			return -EFSCORRUPTED;
+			next_agino = ip->i_next_unlinked;
+			agi = agibp->b_addr;
+
+			/* remove from head of list */
+			if (next_agino == agino ||
+			    !xfs_verify_agino_or_null(mp, agno, next_agino))
+				goto err_fscorrupted;
+
+			error = xfs_iunlink_update_bucket(tp, agno,
+				agibp, agino, next_agino);
+			if (error)
+				goto err_out;
+
+			goto next_fixup;
 		}
+	}
 
-		/* update the on disk inode now */
-		pip->i_next_unlinked = next_agino;
-		xfs_iunlink_log(tp, pip);
+	next_agino = ip->i_next_unlinked;
+
+	/* lookup previous inode and update to point at next */
+	pip = xfs_iunlink_lookup_prev(pag, ip);
+	if (IS_ERR_OR_NULL(pip))
+		goto err_fscorrupted;
+
+	if (pip->i_next_unlinked != agino) {
+		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+					NULL, 0, __this_address);
+		goto err_fscorrupted;
 	}
+	pip->i_next_unlinked = next_agino;
+	xfs_iunlink_log(tp, pip);
 
-	/* lookup the next inode and update to point at prev */
-	if (ip->i_next_unlinked != NULLAGINO) {
+next_fixup:
+	if (next_agino == NULLAGINO) {
+		/* so iunlink recovery can work here */
+		if (pag->pag_unlinked_tail == ip)
+			pag->pag_unlinked_tail = pip;
+	} else {
+		/* lookup the next inode and update to point at prev */
 		struct xfs_inode	*nip;
 
-		nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
+		nip = xfs_iunlink_lookup_next(pag, ip);
 		if (IS_ERR_OR_NULL(nip))
-			return -EFSCORRUPTED;
+			goto err_fscorrupted;
 
 		if (nip->i_prev_unlinked != agino) {
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 						NULL, 0, __this_address);
-			return -EFSCORRUPTED;
+			goto err_fscorrupted;
 		}
 		/* in memory update only */
 		nip->i_prev_unlinked = ip->i_prev_unlinked;
 	}
+	mutex_unlock(&pag->pag_unlinked_mutex);
+	xfs_perag_put(pag);
 
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	/*
 	 * Now clear prev/next from this inode and update on disk if we
 	 * need to clear the on-disk link.
 	 */
 	ip->i_prev_unlinked = NULLAGINO;
-	ip->i_next_unlinked = NULLAGINO;
-	if (next_agino != NULLAGINO)
+	if (next_agino != NULLAGINO) {
+		ip->i_next_unlinked = NULLAGINO;
 		xfs_iunlink_log(tp, ip);
+	}
 	return 0;
-}
 
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI.  It will be pulled from this
- * list when the inode is freed.
- */
-STATIC int
-xfs_iunlink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*agibp;
-	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
-	int			error;
-
-	ASSERT(ip->i_next_unlinked == NULLAGINO);
-	ASSERT(VFS_I(ip)->i_nlink == 0);
-	ASSERT(VFS_I(ip)->i_mode != 0);
-	trace_xfs_iunlink(ip);
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(mp, tp, agno, &agibp);
-	if (error)
-		return error;
-
-	return xfs_iunlink_insert_inode(tp, agibp, ip);
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-STATIC int
-xfs_iunlink_remove(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*agibp;
-	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
-	int			error;
-
-	trace_xfs_iunlink_remove(ip);
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(mp, tp, agno, &agibp);
-	if (error)
-		return error;
-
-	return xfs_iunlink_remove_inode(tp, agibp, ip);
+err_fscorrupted:
+	error = -EFSCORRUPTED;
+err_out:
+	mutex_unlock(&pag->pag_unlinked_mutex);
+	xfs_perag_put(pag);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index d47eea31c165..3f2739316424 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2801,6 +2801,11 @@ xlog_recover_iunlink_ag(
 					prev_ip->i_next_unlinked = NULLAGINO;
 				break;
 			}
+
+			/* XXX: take pag_unlinked_mutex across the loop? */
+			if (!bucket)
+				agibp->b_pag->pag_unlinked_tail = ip;
+
 			if (prev_ip) {
 				ip->i_prev_unlinked = prev_agino;
 				xfs_irele(prev_ip);
@@ -2812,6 +2817,7 @@ xlog_recover_iunlink_ag(
 		}
 		if (prev_ip)
 			xfs_irele(prev_ip);
+
 		if (error) {
 			/*
 			 * We can't read an inode this bucket points to, or an
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 031e96ff022d..a9701f863e84 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -223,6 +223,9 @@ xfs_initialize_perag(
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
 		spin_lock_init(&pag->pag_state_lock);
+
+		mutex_init(&pag->pag_unlinked_mutex);
+		pag->pag_unlinked_tail = NULL;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a72cfcaa4ad1..107a634a34f7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -372,6 +372,9 @@ typedef struct xfs_perag {
 	/* reference count */
 	uint8_t			pagf_refcount_level;
 
+	struct mutex		pag_unlinked_mutex;
+	struct xfs_inode	*pag_unlinked_tail;
+
 	/*
 	 * Unlinked inode information.  This incore information reflects
 	 * data stored in the AGI, so callers must hold the AGI buffer lock
-- 
2.18.1


  parent reply index

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-07 13:57 [RFC PATCH 0/2] xfs: more unlinked inode list optimization v1 Gao Xiang
2020-07-07 13:57 ` [RFC PATCH 1/2] xfs: arrange all unlinked inodes into one list Gao Xiang
2020-07-08 22:33   ` Dave Chinner
2020-07-09  0:17     ` Gao Xiang
2020-07-07 13:57 ` Gao Xiang [this message]
2020-07-08 17:03   ` [RFC PATCH 2/2] xfs: don't access AGI on unlinked inodes if it can Darrick J. Wong
2020-07-08 23:40     ` Gao Xiang
2020-07-08 23:33   ` Dave Chinner
2020-07-09  0:55     ` Gao Xiang
2020-07-09  2:32       ` Dave Chinner
2020-07-09 10:36         ` Gao Xiang
2020-07-09 10:47           ` Gao Xiang
2020-07-09 22:36           ` Dave Chinner
2020-07-24  6:12 ` [RFC PATCH v2 0/3] xfs: more unlinked inode list optimization v2 Gao Xiang
2020-07-24  6:12   ` [RFC PATCH v2 1/3] xfs: arrange all unlinked inodes into one list Gao Xiang
2020-07-24  6:12   ` [RFC PATCH v2 2/3] xfs: introduce perag iunlink lock Gao Xiang
2020-07-24  6:12   ` [RFC PATCH v2 3/3] xfs: insert unlinked inodes from tail Gao Xiang
2020-08-18 13:30   ` [RFC PATCH v4 0/3] xfs: more unlinked inode list optimization v4 Gao Xiang
2020-08-18 13:30     ` [RFC PATCH v4 1/3] xfs: get rid of unused pagi_unlinked_hash Gao Xiang
2020-08-19  0:54       ` Darrick J. Wong
2020-08-21  1:09         ` Dave Chinner
2020-08-18 13:30     ` [RFC PATCH v4 2/3] xfs: introduce perag iunlink lock Gao Xiang
2020-08-19  1:06       ` Darrick J. Wong
2020-08-19  1:23         ` Gao Xiang
2020-08-18 13:30     ` [RFC PATCH v4 3/3] xfs: insert unlinked inodes from tail Gao Xiang
2020-08-19  0:53     ` [RFC PATCH v4 0/3] xfs: more unlinked inode list optimization v4 Darrick J. Wong
2020-08-19  1:14       ` Gao Xiang
2020-08-20  2:46     ` Darrick J. Wong
2020-08-20  4:01       ` Gao Xiang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200707135741.487-3-hsiangkao@redhat.com \
    --to=hsiangkao@redhat.com \
    --cc=bfoster@redhat.com \
    --cc=darrick.wong@oracle.com \
    --cc=david@fromorbit.com \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-XFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-xfs/0 linux-xfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-xfs linux-xfs/ https://lore.kernel.org/linux-xfs \
		linux-xfs@vger.kernel.org
	public-inbox-index linux-xfs

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-xfs


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git