All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [Bug report] overlayfs over xfs whiteout operation may cause deadlock
       [not found] <CABRboy006NP8JrxuBgEJbfCcGGUY2Kucwfov+HJf2xW34D5Ocg@mail.gmail.com>
@ 2020-12-11 23:42 ` Darrick J. Wong
       [not found]   ` <CABRboy35_tyxA3gHN7_=xp0_RVugQjvFOHCRsH4Y4rrivE7HmQ@mail.gmail.com>
  0 siblings, 1 reply; 3+ messages in thread
From: Darrick J. Wong @ 2020-12-11 23:42 UTC (permalink / raw)
  To: wenli xie; +Cc: linux-xfs

On Thu, Dec 10, 2020 at 03:14:53PM +0800, wenli xie wrote:
> We are using the linux ubuntu kernel for 5.4.0-34.generic.x86_64. In our
> host, there are many containers use the same base image. All of them have
> `mv /etc/security/access.conf  /etc/security/access.conf.bak` operation
> when the container starts up. Sometimes we met that the `mv` process came
> to D state and couldn't recover.
> I did some research and find this issue can be reproduced by following
> steps.
> 
> 1. mount a disk to /var/mnt/test with xfs fs.
>  mount -t xfs /dev/<device> /var/mnt/test
> 2. Create lower dir under /var/mnt/test
>    /var/mnt/test/lowdir
>    /var/mnt/test/lowdir1
> 3. Create `/etc/access.conf` in dir `/var/mnt/test/lowdir`
> 4.
> Run following script in different terminals
> 1)
> while true
> do
>   umount /var/mnt/test/merged
>   rm -rf /var/mnt/test/merged
>   rm -rf /var/mnt/test/upperdir
>   rm -rf /var/mnt/test/workdir
>   mkdir /var/mnt/test/merged
>   mkdir /var/mnt/test/workdir
>   mkdir /var/mnt/test/upperdir
>   mount -t overlay overlay -o
> lowerdir=/var/mnt/test/lowerdir:/var/mnt/test/lowerdir1,upperdir=/var/mnt/test/upperdir,workdir=/var/mnt/test/workdir
> /var/mnt/test/merged
>   mv /var/mnt/test/merged/etc/access.conf
> /var/mnt/test/merged/etc/access.conf.bak
>   touch /var/mnt/test/merged/etc/access.conf
>   mv /var/mnt/test/merged/etc/access.conf
> /var/mnt/test/merged/etc/access.conf.bak
>   touch /var/mnt/test/merged/etc/access.conf
>   echo "---------------"
> done
> 
> 2)
> while true
> do
>   umount /var/mnt/test/merged1
>   rm -rf /var/mnt/test/merged1
>   rm -rf /var/mnt/test/upperdir1
>   rm -rf /var/mnt/test/workdir1
>   mkdir /var/mnt/test/merged1
>   mkdir /var/mnt/test/workdir1
>   mkdir /var/mnt/test/upperdir1
>   mount -t overlay overlay -o
> lowerdir=/var/mnt/test/lowerdir:/var/mnt/test/lowerdir1,upperdir=/var/mnt/test/upperdir1,workdir=/var/mnt/test/workdir1
> /var/mnt/test/merged1
>   mv /var/mnt/test/merged1/etc/access.conf
> /var/mnt/test/merged1/etc/access.conf.bak
>   touch /var/mnt/test/merged1/etc/access.conf
>   mv /var/mnt/test/merged1/etc/access.conf
> /var/mnt/test/merged1/etc/access.conf.bak
>   touch /var/mnt/test/merged1/etc/access.conf
>   echo "---------------"
> done
> 
> 3)
> while true
> do
>   umount /var/mnt/test/merged3
>   rm -rf /var/mnt/test/merged3
>   rm -rf /var/mnt/test/upperdir3
>   rm -rf /var/mnt/test/workdir3
>   mkdir /var/mnt/test/merged3
>   mkdir /var/mnt/test/workdir3
>   mkdir /var/mnt/test/upperdir3
>   mount -t overlay overlay -o
> lowerdir=/var/mnt/test/lowerdir:/var/mnt/test/lowerdir1,upperdir=/var/mnt/test/upperdir3,workdir=/var/mnt/test/workdir3
> /var/mnt/test/merged3
>   mv /var/mnt/test/merged3/etc/access.conf
> /var/mnt/test/merged3/etc/access.conf.bak
>   touch /var/mnt/test/merged3/etc/access.conf
>   mv /var/mnt/test/merged3/etc/access.conf
> /var/mnt/test/merged3/etc/access.conf.bak
>   touch /var/mnt/test/merged3/etc/access.conf
>   echo "---------------"
> done
> 
> 4)
> while true
> do
>   umount /var/mnt/test/merged4
>   rm -rf /var/mnt/test/merged4
>   rm -rf /var/mnt/test/upperdir4
>   rm -rf /var/mnt/test/workdir4
>   mkdir /var/mnt/test/merged4
>   mkdir /var/mnt/test/workdir4
>   mkdir /var/mnt/test/upperdir4
>   mount -t overlay overlay -o
> lowerdir=/var/mnt/test/lowerdir:/var/mnt/test/lowerdir1,upperdir=/var/mnt/test/upperdir4,workdir=/var/mnt/test/workdir4
> /var/mnt/test/merged4
>   mv /var/mnt/test/merged4/etc/access.conf
> /var/mnt/test/merged4/etc/access.conf.bak
>   touch /var/mnt/test/merged4/etc/access.conf
>   mv /var/mnt/test/merged4/etc/access.conf
> /var/mnt/test/merged4/etc/access.conf.bak
>   touch /var/mnt/test/merged4/etc/access.conf
>   echo "---------------"
> done
> 
> 5)
> while true
> do
>   umount /var/mnt/test/merged5
>   rm -rf /var/mnt/test/merged5
>   rm -rf /var/mnt/test/upperdir5
>   rm -rf /var/mnt/test/workdir5
>   mkdir /var/mnt/test/merged5
>   mkdir /var/mnt/test/workdir5
>   mkdir /var/mnt/test/upperdir5
>   mount -t overlay overlay -o
> lowerdir=/var/mnt/test/lowerdir:/var/mnt/test/lowerdir1,upperdir=/var/mnt/test/upperdir5,workdir=/var/mnt/test/workdir5
> /var/mnt/test/merged5
>   mv /var/mnt/test/merged5/etc/access.conf
> /var/mnt/test/merged5/etc/access.conf.bak
>   touch /var/mnt/test/merged5/etc/access.conf
>   mv /var/mnt/test/merged5/etc/access.conf
> /var/mnt/test/merged5/etc/access.conf.bak
>   touch /var/mnt/test/merged5/etc/access.conf
>   echo "---------------"
> done
> 
> 6)
> while true
> do
>   umount /var/mnt/test/merged6
>   rm -rf /var/mnt/test/merged6
>   rm -rf /var/mnt/test/upperdir6
>   rm -rf /var/mnt/test/workdir6
>   mkdir /var/mnt/test/merged6
>   mkdir /var/mnt/test/workdir6
>   mkdir /var/mnt/test/upperdir6
>   mount -t overlay overlay -o
> lowerdir=/var/mnt/test/lowerdir:/var/mnt/test/lowerdir1,upperdir=/var/mnt/test/upperdir6,workdir=/var/mnt/test/workdir6
> /var/mnt/test/merged6
>   mv /var/mnt/test/merged6/etc/access.conf
> /var/mnt/test/merged6/etc/access.conf.bak
>   touch /var/mnt/test/merged6/etc/access.conf
>   mv /var/mnt/test/merged6/etc/access.conf
> /var/mnt/test/merged6/etc/access.conf.bak
>   touch /var/mnt/test/merged6/etc/access.conf
>   echo "---------------"
> done
> 
> 
> 
> After a while, you will find that some `mv` processes become stuck in D state
> 
> crash> foreach UN ps -m|sort|tail
> [0 00:00:23.271] [UN]  PID: 38033  TASK: ffff908cd4c8ba80  CPU: 20
>  COMMAND: "mv"
> [0 00:00:23.332] [UN]  PID: 38029  TASK: ffff908cd6093a80  CPU: 1
> COMMAND: "mv"
> [0 00:00:23.332] [UN]  PID: 38037  TASK: ffff908cd606ba80  CPU: 31
>  COMMAND: "touch"
> [0 00:00:23.332] [UN]  PID: 38038  TASK: ffff908cd4c50000  CPU: 24
>  COMMAND: "mv"
> [0 00:00:23.333] [UN]  PID: 38032  TASK: ffff908cd5a0ba80  CPU: 33
>  COMMAND: "mv"
> [0 00:00:23.333] [UN]  PID: 38035  TASK: ffff908cd5110000  CPU: 8
> COMMAND: "touch"
> [0 00:00:23.336] [UN]  PID: 38040  TASK: ffff908cd4cbba80  CPU: 2
> COMMAND: "touch"
> [0 00:00:23.337] [UN]  PID: 38039  TASK: ffff908cd62cba80  CPU: 35
>  COMMAND: "touch"
> [0 00:00:23.338] [UN]  PID: 38030  TASK: ffff908cdbfc0000  CPU: 15
>  COMMAND: "mv"
> [0 00:00:23.339] [UN]  PID: 38036  TASK: ffff908cd4ce8000  CPU: 22
>  COMMAND: "mv"
> 
> 
> The `mv` process have the same stack like:
> 
> crash> bt
> PID: 38029  TASK: ffff908cd6093a80  CPU: 1   COMMAND: "mv"
>  #0 [ffffa848961b3558] __schedule at ffffffff8fa96c09
>  #1 [ffffa848961b35e8] schedule at ffffffff8fa97235
>  #2 [ffffa848961b3608] schedule_timeout at ffffffff8fa9ccc6
>  #3 [ffffa848961b36c8] __down_common at ffffffff8fa9b2b3
>  #4 [ffffa848961b3748] __down at ffffffff8fa9b311
>  #5 [ffffa848961b3758] down at ffffffff8ed43be1
>  #6 [ffffa848961b3778] xfs_buf_lock at ffffffffc06fab78 [xfs]
>  #7 [ffffa848961b37a8] xfs_buf_find at ffffffffc06fb160 [xfs]
>  #8 [ffffa848961b3850] xfs_buf_get_map at ffffffffc06fbf41 [xfs]
>  #9 [ffffa848961b38a0] xfs_buf_read_map at ffffffffc06fcd67 [xfs]
> #10 [ffffa848961b38f0] xfs_trans_read_buf_map at ffffffffc0752496 [xfs]
> #11 [ffffa848961b3938] xfs_read_agi at ffffffffc06cdf02 [xfs]
> #12 [ffffa848961b39a0] xfs_iunlink at ffffffffc071c741 [xfs]
> #13 [ffffa848961b3a08] xfs_droplink at ffffffffc071c9e2 [xfs]
> #14 [ffffa848961b3a30] xfs_rename at ffffffffc07221b7 [xfs]
> #15 [ffffa848961b3b08] xfs_vn_rename at ffffffffc0717ec4 [xfs]
> #16 [ffffa848961b3b80] vfs_rename at ffffffff8eff9d65
> #17 [ffffa848961b3c40] ovl_do_rename at ffffffffc091d177 [overlay]
> #18 [ffffa848961b3c78] ovl_rename at ffffffffc091e885 [overlay]
> #19 [ffffa848961b3d10] vfs_rename at ffffffff8eff9d65
> #20 [ffffa848961b3dd8] do_renameat2 at ffffffff8effcc66
> #21 [ffffa848961b3ea8] __x64_sys_rename at ffffffff8effcdb9
> #22 [ffffa848961b3ec0] do_syscall_64 at ffffffff8ec04c54
> #23 [ffffa848961b3f50] entry_SYSCALL_64_after_hwframe at ffffffff8fc00091
>     RIP: 00007f84ed829da7  RSP: 00007ffe6da90508  RFLAGS: 00000202
>     RAX: ffffffffffffffda  RBX: 00007ffe6da9093f  RCX: 00007f84ed829da7
>     RDX: 0000000000000000  RSI: 00007ffe6da92b94  RDI: 00007ffe6da92b7a
>     RBP: 00007ffe6da908e0   R8: 0000000000000001   R9: 0000000000000000
>     R10: 0000000000000001  R11: 0000000000000202  R12: 00007ffe6da909c0
>     R13: 0000000000000000  R14: 0000000000000000  R15: 00007ffe6da92b7a
>     ORIG_RAX: 0000000000000052  CS: 0033  SS: 002b
> 
> 
> I revert this PR
> https://github.com/torvalds/linux/commit/fa6c668d807b1e9ac041101dfcb59bd8e279cfe5
> , which can indicate which process is hold the lock. This can help for the
> debug.
> 
> 1. Check for process 38036
> 
> crash> set 38036
>     PID: 38036
> COMMAND: "mv"
>    TASK: ffff908cd4ce8000  [THREAD_INFO: ffff908cd4ce8000]
>     CPU: 22
>   STATE: TASK_UNINTERRUPTIBLE
> 
> 
> It is waiting for a lock, which is held by 38029:
> 
> crash> struct xfs_buf.b_last_holder ffff908f92a0a680
>   b_last_holder = 38029
> 
> 
> 2. Check for process 38029
> crash> set 38029
>     PID: 38029
> COMMAND: "mv"
>    TASK: ffff908cd6093a80  [THREAD_INFO: ffff908cd6093a80]
>     CPU: 1
>   STATE: TASK_UNINTERRUPTIBLE
> 
> crash> struct xfs_buf.b_last_holder ffff908f993cc780
>   b_last_holder = 38030
> 
> It is waiting for a lock, which is held by 38030
> 
> 3. Check for process 38030
> crash> set 38030
>     PID: 38030
> COMMAND: "mv"
>    TASK: ffff908cdbfc0000  [THREAD_INFO: ffff908cdbfc0000]
>     CPU: 15
>   STATE: TASK_UNINTERRUPTIBLE
> 
> crash> struct xfs_buf.b_last_holder ffff908f92a0a680
>   b_last_holder = 38029
> 
> 
> 
> I did a bit more trace by ebpf, the lock ffff908f92a0a680 should be held by
> 38029 in xfs_iunlink_remove() but not released:
> 
>   b'xfs_buf_trylock+0x1'
>   b'xfs_buf_get_map+0x51'
>   b'xfs_buf_read_map+0x47'
>   b'xfs_trans_read_buf_map+0xf6'
>   b'xfs_read_agi+0xd2'
>   b'xfs_iunlink_remove+0x9a'
>   b'xfs_rename+0x618'
>   b'xfs_vn_rename+0x104'
>   b'vfs_rename+0x6e5'
>   b'ovl_do_rename+0x47'
>   b'ovl_rename+0x5d5'
>   b'vfs_rename+0x6e5'
>   b'do_renameat2+0x576'
>   b'__x64_sys_rename+0x29'
>   b'do_syscall_64+0x84'
>   b'entry_SYSCALL_64_after_hwframe+0x49'
> 
> The lock 0xffff908f993cc780 should also be held by 38030 in
> xfs_iunlink_remove() but not released:
> 
>   b'xfs_buf_trylock+0x1'
>   b'xfs_buf_get_map+0x51'
>   b'xfs_buf_read_map+0x47'
>   b'xfs_trans_read_buf_map+0xf6'
>   b'xfs_read_agi+0xd2'
>   b'xfs_iunlink_remove+0x9a'
>   b'xfs_rename+0x618'
>   b'xfs_vn_rename+0x104'
>   b'vfs_rename+0x6e5'
>   b'ovl_do_rename+0x47'
>   b'ovl_rename+0x5d5'
>   b'vfs_rename+0x6e5'
>   b'do_renameat2+0x576'
>   b'__x64_sys_rename+0x29'
>   b'do_syscall_64+0x84'
>   b'entry_SYSCALL_64_after_hwframe+0x49'
> 
> 
> Looks like there are ABBA deadlock in this scenario.
> 
> This issue is very easy to be reproduced by the scripts I provided.

Hm.  Does this happen on upstream?  There have been a couple of deadlock
fixes for the rename code in the past couple of years.

93597ae8dac0149b5c00b787cba6bf7ba213e666 (5.5)
bc56ad8c74b8588685c2875de0df8ab6974828ef (5.4)

I have no idea if Ubuntu has picked up the newer one, you'll have to
contact them.

(Did this ever show up on linux-xfs?)

--D

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [Bug report] overlayfs over xfs whiteout operation may cause deadlock
       [not found]   ` <CABRboy35_tyxA3gHN7_=xp0_RVugQjvFOHCRsH4Y4rrivE7HmQ@mail.gmail.com>
@ 2020-12-17 21:11     ` Darrick J. Wong
  2020-12-22  1:29       ` wenli xie
  0 siblings, 1 reply; 3+ messages in thread
From: Darrick J. Wong @ 2020-12-17 21:11 UTC (permalink / raw)
  To: wenli xie; +Cc: xfs

On Tue, Dec 15, 2020 at 08:44:27PM +0800, wenli xie wrote:
> I tried upstream kernel 5.10 to do the test, and this issue still  can be
> reproduced.

Thanks for the report, I've condensed this down to the following:

#!/bin/bash

# Reproducer for the overlayfs-over-xfs rename (whiteout) deadlock.
# Knobs follow the fstests naming convention.
SCRATCH_MNT=/mnt
LOAD_FACTOR=1
TIME_FACTOR=1

# Start every run from a freshly formatted xfs scratch filesystem.
mkfs.xfs -f /dev/sda
mount /dev/sda $SCRATCH_MNT

# Two lower layers shared by all workers; the file being renamed lives in
# the first one.  "workers/" tracks which workers are still alive, and the
# "running" flag file tells them when to stop.
mkdir $SCRATCH_MNT/lowerdir
mkdir $SCRATCH_MNT/lowerdir1
mkdir $SCRATCH_MNT/lowerdir/etc
mkdir $SCRATCH_MNT/workers
echo salts > $SCRATCH_MNT/lowerdir/etc/access.conf
touch $SCRATCH_MNT/running

# Ask all workers to exit by removing the "running" flag file, then wait
# until every worker has deleted its tag file from workers/.
stop_workers() {
	# Already stopped (or never started)?  Nothing to do.
	test -e $SCRATCH_MNT/running || return
	rm -f $SCRATCH_MNT/running

	# Each worker removes its tag on exit; loop until none remain.
	while [ "$(ls $SCRATCH_MNT/workers/ | wc -l)" -gt 0 ]; do
		wait
	done
}

# One worker loop: repeatedly build a private overlay mount on top of the
# shared lower dirs and exercise rename-over-existing-file, which makes
# overlayfs ask xfs to rename a whiteout inode onto the target.
# $1: unique tag used to keep this worker's dirs separate from the others.
worker() {
	local tag="$1"
	local mergedir="$SCRATCH_MNT/merged$tag"
	local l="lowerdir=$SCRATCH_MNT/lowerdir:$SCRATCH_MNT/lowerdir1"
	local u="upperdir=$SCRATCH_MNT/upperdir$tag"
	local w="workdir=$SCRATCH_MNT/workdir$tag"
	local i="index=off"

	# Announce ourselves to stop_workers; run until the flag file goes away.
	touch $SCRATCH_MNT/workers/$tag
	while test -e $SCRATCH_MNT/running; do
		# Fresh overlay dirs each iteration so every mount starts clean.
		rm -rf $SCRATCH_MNT/merged$tag
		rm -rf $SCRATCH_MNT/upperdir$tag
		rm -rf $SCRATCH_MNT/workdir$tag
		mkdir $SCRATCH_MNT/merged$tag
		mkdir $SCRATCH_MNT/workdir$tag
		mkdir $SCRATCH_MNT/upperdir$tag

		mount -t overlay overlay -o "$l,$u,$w,$i" $mergedir
		# rename over an existing lower file -> whiteout + target unlink
		mv $mergedir/etc/access.conf $mergedir/etc/access.conf.bak
		touch $mergedir/etc/access.conf
		mv $mergedir/etc/access.conf $mergedir/etc/access.conf.bak
		touch $mergedir/etc/access.conf
		umount $mergedir
	done
	# Tell stop_workers we are done.
	rm -f $SCRATCH_MNT/workers/$tag
}

# Launch several workers in parallel, let them hammer the rename path for
# a while, then shut everything down cleanly.
for i in $(seq 0 $((4 + LOAD_FACTOR)) ); do
	worker $i &
done

sleep $((30 * TIME_FACTOR))
stop_workers

...and I think this is enough to diagnose the deadlock.

This is an ABBA deadlock caused by locking the AGI buffers in the wrong
order.  Specifically, we seem to be calling xfs_rename with a
non-null @wip and a non-null @target_ip.  In the deadlock scenario, @wip
is an inode in AG 2, and @target_ip is an inode in AG 0 with nlink==1.

First we call xfs_iunlink_remove to remove @wip from the unlinked list,
which causes us to lock AGI 2.  Next we replace the directory entry.
Finally, we need to droplink @target_ip.  Since @target_ip has nlink==1,
xfs_droplink will need to put it on AGI 0's unlinked list.

Unfortunately, the locking rules say that you can only lock AGIs in
increasing order.  This means that we cannot lock AGI 0 after locking
AGI 2 without risking deadlock.

Does the attached patch fix the deadlock for you?

--D

From: Darrick J. Wong <darrick.wong@oracle.com>
Subject: [PATCH] xfs: fix an ABBA deadlock in xfs_rename

When overlayfs is running on top of xfs and the user unlinks a file in
the overlay, overlayfs will create a whiteout inode and ask xfs to
"rename" the whiteout file atop the one being unlinked.  If the file
being unlinked loses its one nlink, we then have to put the inode on the
unlinked list.

This requires us to grab the AGI buffer of the whiteout inode to take it
off the unlinked list (which is where whiteouts are created) and to grab
the AGI buffer of the file being deleted.  If the whiteout was created
in a higher numbered AG than the file being deleted, we'll lock the AGIs
in the wrong order and deadlock.

Therefore, grab all the AGI locks we think we'll need ahead of time, and
in the correct order.

Reported-by: wenli xie <wlxie7296@gmail.com>
Fixes: 93597ae8dac0 ("xfs: Fix deadlock between AGI and AGF when target_ip exists in xfs_rename()")
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c |   46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b7352bc4c815..dd419a1bc6ba 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3000,6 +3000,48 @@ xfs_rename_alloc_whiteout(
 	return 0;
 }
 
+/*
+ * For the general case of renaming files, lock all the AGI buffers we need to
+ * handle bumping the nlink of the whiteout inode off the unlinked list and to
+ * handle dropping the nlink of the target inode.  We have to do this in
+ * increasing AG order to avoid deadlocks.
+ */
+static int
+xfs_rename_lock_agis(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*wip,
+	struct xfs_inode	*target_ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp;
+	xfs_agnumber_t		agi_locks[2] = { NULLAGNUMBER, NULLAGNUMBER };
+	int			error;
+
+	if (wip)
+		agi_locks[0] = XFS_INO_TO_AGNO(mp, wip->i_ino);
+
+	if (target_ip && VFS_I(target_ip)->i_nlink == 1)
+		agi_locks[1] = XFS_INO_TO_AGNO(mp, target_ip->i_ino);
+
+	if (agi_locks[0] != NULLAGNUMBER && agi_locks[1] != NULLAGNUMBER &&
+	    agi_locks[0] > agi_locks[1])
+		swap(agi_locks[0], agi_locks[1]);
+
+	if (agi_locks[0] != NULLAGNUMBER) {
+		error = xfs_read_agi(mp, tp, agi_locks[0], &bp);
+		if (error)
+			return error;
+	}
+
+	if (agi_locks[1] != NULLAGNUMBER) {
+		error = xfs_read_agi(mp, tp, agi_locks[1], &bp);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
 /*
  * xfs_rename
  */
@@ -3130,6 +3172,10 @@ xfs_rename(
 		}
 	}
 
+	error = xfs_rename_lock_agis(tp, wip, target_ip);
+	if (error)
+		return error;
+
 	/*
 	 * Directory entry creation below may acquire the AGF. Remove
 	 * the whiteout from the unlinked list first to preserve correct

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [Bug report] overlayfs over xfs whiteout operation may cause deadlock
  2020-12-17 21:11     ` Darrick J. Wong
@ 2020-12-22  1:29       ` wenli xie
  0 siblings, 0 replies; 3+ messages in thread
From: wenli xie @ 2020-12-22  1:29 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs

Thanks Darrick
This patch works fine.  I did some tests for about 3 days, this issue
should be fixed.


During the test, I did 'ps -aux|grep mv'  to get the processes' status.
I can see  some `mv` processes come to 'D' state and then disappear.
So this patch may impact the rename operation's performance of overlay and xfs.
I'd like to have a test, any suggestions?
For overlay, this is fine because the container's rootfs shouldn't
have many rename calls.
What I am most worried about is XFS.




On Fri, Dec 18, 2020 at 5:11 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Tue, Dec 15, 2020 at 08:44:27PM +0800, wenli xie wrote:
> > I tried upstream kernel 5.10 to do the test, and this issue still  can be
> > reproduced.
>
> Thanks for the report, I've condensed this down to the following:
>
> #!/bin/bash
>
> SCRATCH_MNT=/mnt
> LOAD_FACTOR=1
> TIME_FACTOR=1
>
> mkfs.xfs -f /dev/sda
> mount /dev/sda $SCRATCH_MNT
>
> mkdir $SCRATCH_MNT/lowerdir
> mkdir $SCRATCH_MNT/lowerdir1
> mkdir $SCRATCH_MNT/lowerdir/etc
> mkdir $SCRATCH_MNT/workers
> echo salts > $SCRATCH_MNT/lowerdir/etc/access.conf
> touch $SCRATCH_MNT/running
>
> stop_workers() {
>         test -e $SCRATCH_MNT/running || return
>         rm -f $SCRATCH_MNT/running
>
>         while [ "$(ls $SCRATCH_MNT/workers/ | wc -l)" -gt 0 ]; do
>                 wait
>         done
> }
>
> worker() {
>         local tag="$1"
>         local mergedir="$SCRATCH_MNT/merged$tag"
>         local l="lowerdir=$SCRATCH_MNT/lowerdir:$SCRATCH_MNT/lowerdir1"
>         local u="upperdir=$SCRATCH_MNT/upperdir$tag"
>         local w="workdir=$SCRATCH_MNT/workdir$tag"
>         local i="index=off"
>
>         touch $SCRATCH_MNT/workers/$tag
>         while test -e $SCRATCH_MNT/running; do
>                 rm -rf $SCRATCH_MNT/merged$tag
>                 rm -rf $SCRATCH_MNT/upperdir$tag
>                 rm -rf $SCRATCH_MNT/workdir$tag
>                 mkdir $SCRATCH_MNT/merged$tag
>                 mkdir $SCRATCH_MNT/workdir$tag
>                 mkdir $SCRATCH_MNT/upperdir$tag
>
>                 mount -t overlay overlay -o "$l,$u,$w,$i" $mergedir
>                 mv $mergedir/etc/access.conf $mergedir/etc/access.conf.bak
>                 touch $mergedir/etc/access.conf
>                 mv $mergedir/etc/access.conf $mergedir/etc/access.conf.bak
>                 touch $mergedir/etc/access.conf
>                 umount $mergedir
>         done
>         rm -f $SCRATCH_MNT/workers/$tag
> }
>
> for i in $(seq 0 $((4 + LOAD_FACTOR)) ); do
>         worker $i &
> done
>
> sleep $((30 * TIME_FACTOR))
> stop_workers
>
> ...and I think this is enough to diagnose the deadlock.
>
> This is an ABBA deadlock caused by locking the AGI buffers in the wrong
> order.  Specifically, we seem to be calling xfs_rename with a
> non-null @wip and a non-null @target_ip.  In the deadlock scenario, @wip
> is an inode in AG 2, and @target_ip is an inode in AG 0 with nlink==1.
>
> First we call xfs_iunlink_remove to remove @wip from the unlinked list,
> which causes us to lock AGI 2.  Next we replace the directory entry.
> Finally, we need to droplink @target_ip.  Since @target_ip has nlink==1,
> xfs_droplink will need to put it on AGI 0's unlinked list.
>
> Unfortunately, the locking rules say that you can only lock AGIs in
> increasing order.  This means that we cannot lock AGI 0 after locking
> AGI 2 without risking deadlock.
>
> Does the attached patch fix the deadlock for you?
>
> --D
>
> From: Darrick J. Wong <darrick.wong@oracle.com>
> Subject: [PATCH] xfs: fix an ABBA deadlock in xfs_rename
>
> When overlayfs is running on top of xfs and the user unlinks a file in
> the overlay, overlayfs will create a whiteout inode and ask xfs to
> "rename" the whiteout file atop the one being unlinked.  If the file
> being unlinked loses its one nlink, we then have to put the inode on the
> unlinked list.
>
> This requires us to grab the AGI buffer of the whiteout inode to take it
> off the unlinked list (which is where whiteouts are created) and to grab
> the AGI buffer of the file being deleted.  If the whiteout was created
> in a higher numbered AG than the file being deleted, we'll lock the AGIs
> in the wrong order and deadlock.
>
> Therefore, grab all the AGI locks we think we'll need ahead of time, and
> in the correct order.
>
> Reported-by: wenli xie <wlxie7296@gmail.com>
> Fixes: 93597ae8dac0 ("xfs: Fix deadlock between AGI and AGF when target_ip exists in xfs_rename()")
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c |   46 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
>
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index b7352bc4c815..dd419a1bc6ba 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3000,6 +3000,48 @@ xfs_rename_alloc_whiteout(
>         return 0;
>  }
>
> +/*
> + * For the general case of renaming files, lock all the AGI buffers we need to
> + * handle bumping the nlink of the whiteout inode off the unlinked list and to
> + * handle dropping the nlink of the target inode.  We have to do this in
> + * increasing AG order to avoid deadlocks.
> + */
> +static int
> +xfs_rename_lock_agis(
> +       struct xfs_trans        *tp,
> +       struct xfs_inode        *wip,
> +       struct xfs_inode        *target_ip)
> +{
> +       struct xfs_mount        *mp = tp->t_mountp;
> +       struct xfs_buf          *bp;
> +       xfs_agnumber_t          agi_locks[2] = { NULLAGNUMBER, NULLAGNUMBER };
> +       int                     error;
> +
> +       if (wip)
> +               agi_locks[0] = XFS_INO_TO_AGNO(mp, wip->i_ino);
> +
> +       if (target_ip && VFS_I(target_ip)->i_nlink == 1)
> +               agi_locks[1] = XFS_INO_TO_AGNO(mp, target_ip->i_ino);
> +
> +       if (agi_locks[0] != NULLAGNUMBER && agi_locks[1] != NULLAGNUMBER &&
> +           agi_locks[0] > agi_locks[1])
> +               swap(agi_locks[0], agi_locks[1]);
> +
> +       if (agi_locks[0] != NULLAGNUMBER) {
> +               error = xfs_read_agi(mp, tp, agi_locks[0], &bp);
> +               if (error)
> +                       return error;
> +       }
> +
> +       if (agi_locks[1] != NULLAGNUMBER) {
> +               error = xfs_read_agi(mp, tp, agi_locks[1], &bp);
> +               if (error)
> +                       return error;
> +       }
> +
> +       return 0;
> +}
> +
>  /*
>   * xfs_rename
>   */
> @@ -3130,6 +3172,10 @@ xfs_rename(
>                 }
>         }
>
> +       error = xfs_rename_lock_agis(tp, wip, target_ip);
> +       if (error)
> +               return error;
> +
>         /*
>          * Directory entry creation below may acquire the AGF. Remove
>          * the whiteout from the unlinked list first to preserve correct

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-12-22  1:30 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CABRboy006NP8JrxuBgEJbfCcGGUY2Kucwfov+HJf2xW34D5Ocg@mail.gmail.com>
2020-12-11 23:42 ` [Bug report] overlayfs over xfs whiteout operation may cause deadlock Darrick J. Wong
     [not found]   ` <CABRboy35_tyxA3gHN7_=xp0_RVugQjvFOHCRsH4Y4rrivE7HmQ@mail.gmail.com>
2020-12-17 21:11     ` Darrick J. Wong
2020-12-22  1:29       ` wenli xie

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.