Linux-BTRFS Archive on lore.kernel.org
 help / Atom feed
* [PATCH] btrfs: honor path->skip_locking in backref code
@ 2019-01-16 16:00 Josef Bacik
  2019-01-17  1:04 ` Qu Wenruo
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Josef Bacik @ 2019-01-16 16:00 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

qgroups will do the old roots lookup at delayed ref time, which could be
while walking down the extent root while running a delayed ref.  This
should be fine, except we specifically lock eb's in the backref walking
code irrespective of path->skip_locking, which deadlocks the system.
Fix up the backref code to honor path->skip_locking, nobody will be
modifying the commit_root when we're searching so it's completely safe
to do.  Thanks,

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/backref.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78556447e1d5..973e8251b1bf 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -712,7 +712,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct preftrees *preftrees)
+			    struct preftrees *preftrees, bool lock)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			free_extent_buffer(eb);
 			return -EIO;
 		}
-		btrfs_tree_read_lock(eb);
+		if (lock)
+			btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
 		else
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
-		btrfs_tree_read_unlock(eb);
+		if (lock)
+			btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
 		cond_resched();
@@ -1227,7 +1229,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &preftrees);
+	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
 	if (ret)
 		goto out;
 
@@ -1288,11 +1290,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 					ret = -EIO;
 					goto out;
 				}
-				btrfs_tree_read_lock(eb);
+				if (!path->skip_locking)
+					btrfs_tree_read_lock(eb);
 				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie, ignore_offset);
-				btrfs_tree_read_unlock_blocking(eb);
+				if (!path->skip_locking)
+					btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
-- 
2.14.3


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-16 16:00 [PATCH] btrfs: honor path->skip_locking in backref code Josef Bacik
@ 2019-01-17  1:04 ` Qu Wenruo
  2019-01-17 14:30   ` David Sterba
  2019-01-18 13:39 ` David Sterba
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2019-01-17  1:04 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team

[-- Attachment #1.1: Type: text/plain, Size: 2738 bytes --]



On 2019/1/17 上午12:00, Josef Bacik wrote:
> qgroups will do the old roots lookup at delayed ref time, which could be
> while walking down the extent root while running a delayed ref.  This
> should be fine, except we specifically lock eb's in the backref walking
> code irrespective of path->skip_locking, which deadlocks the system.
> Fix up the backref code to honor path->skip_locking, nobody will be
> modifying the commit_root when we're searching so it's completely safe
> to do.  Thanks,
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

> ---
>  fs/btrfs/backref.c | 16 ++++++++++------
>  1 file changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
> index 78556447e1d5..973e8251b1bf 100644
> --- a/fs/btrfs/backref.c
> +++ b/fs/btrfs/backref.c
> @@ -712,7 +712,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>   * read tree blocks and add keys where required.
>   */
>  static int add_missing_keys(struct btrfs_fs_info *fs_info,
> -			    struct preftrees *preftrees)
> +			    struct preftrees *preftrees, bool lock)
>  {
>  	struct prelim_ref *ref;
>  	struct extent_buffer *eb;
> @@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
>  			free_extent_buffer(eb);
>  			return -EIO;
>  		}
> -		btrfs_tree_read_lock(eb);
> +		if (lock)
> +			btrfs_tree_read_lock(eb);
>  		if (btrfs_header_level(eb) == 0)
>  			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
>  		else
>  			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
> -		btrfs_tree_read_unlock(eb);
> +		if (lock)
> +			btrfs_tree_read_unlock(eb);
>  		free_extent_buffer(eb);
>  		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
>  		cond_resched();
> @@ -1227,7 +1229,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  
>  	btrfs_release_path(path);
>  
> -	ret = add_missing_keys(fs_info, &preftrees);
> +	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
>  	if (ret)
>  		goto out;
>  
> @@ -1288,11 +1290,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  					ret = -EIO;
>  					goto out;
>  				}
> -				btrfs_tree_read_lock(eb);
> +				if (!path->skip_locking)
> +					btrfs_tree_read_lock(eb);
>  				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
>  				ret = find_extent_in_eb(eb, bytenr,
>  							*extent_item_pos, &eie, ignore_offset);
> -				btrfs_tree_read_unlock_blocking(eb);
> +				if (!path->skip_locking)
> +					btrfs_tree_read_unlock_blocking(eb);
>  				free_extent_buffer(eb);
>  				if (ret < 0)
>  					goto out;
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-17  1:04 ` Qu Wenruo
@ 2019-01-17 14:30   ` David Sterba
  2019-01-17 14:38     ` Qu Wenruo
  0 siblings, 1 reply; 14+ messages in thread
From: David Sterba @ 2019-01-17 14:30 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Josef Bacik, linux-btrfs, kernel-team

On Thu, Jan 17, 2019 at 09:04:13AM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/1/17 上午12:00, Josef Bacik wrote:
> > qgroups will do the old roots lookup at delayed ref time, which could be
> > while walking down the extent root while running a delayed ref.  This
> > should be fine, except we specifically lock eb's in the backref walking
> > code irrespective of path->skip_locking, which deadlocks the system.
> > Fix up the backref code to honor path->skip_locking, nobody will be
> > modifying the commit_root when we're searching so it's completely safe
> > to do.  Thanks,
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> 
> Reviewed-by: Qu Wenruo <wqu@suse.com>

What can we use from https://patchwork.kernel.org/patch/10725371/ to
add to the changelog? Is the deadlock caused by fb235dc06fac? The fix
applies to stable 4.19+ directly and to 4.14 with context fixups.
Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-17 14:30   ` David Sterba
@ 2019-01-17 14:38     ` Qu Wenruo
  2019-01-17 15:23       ` David Sterba
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2019-01-17 14:38 UTC (permalink / raw)
  To: dsterba, Josef Bacik, linux-btrfs, kernel-team

[-- Attachment #1.1: Type: text/plain, Size: 1140 bytes --]



On 2019/1/17 下午10:30, David Sterba wrote:
> On Thu, Jan 17, 2019 at 09:04:13AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2019/1/17 上午12:00, Josef Bacik wrote:
>>> qgroups will do the old roots lookup at delayed ref time, which could be
>>> while walking down the extent root while running a delayed ref.  This
>>> should be fine, except we specifically lock eb's in the backref walking
>>> code irrespective of path->skip_locking, which deadlocks the system.
>>> Fix up the backref code to honor path->skip_locking, nobody will be
>>> modifying the commit_root when we're searching so it's completely safe
>>> to do.  Thanks,
>>>
>>> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
>>
>> Reviewed-by: Qu Wenruo <wqu@suse.com>
> 
> What can we use from https://patchwork.kernel.org/patch/10725371/ to
> add to the changelog? Is the deadlock caused by fb235dc06fac?

The error and cause part should be the same.

Just Josef's is much better than my patch.

Only the [FIX] part needs some update.

Thanks
Qu

> The fix
> applies to stable 4.19+ directly and to 4.14 with context fixups.
> Thanks.
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-17 14:38     ` Qu Wenruo
@ 2019-01-17 15:23       ` David Sterba
  0 siblings, 0 replies; 14+ messages in thread
From: David Sterba @ 2019-01-17 15:23 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Josef Bacik, linux-btrfs, kernel-team

On Thu, Jan 17, 2019 at 10:38:26PM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/1/17 下午10:30, David Sterba wrote:
> > On Thu, Jan 17, 2019 at 09:04:13AM +0800, Qu Wenruo wrote:
> >>
> >>
> >> On 2019/1/17 上午12:00, Josef Bacik wrote:
> >>> qgroups will do the old roots lookup at delayed ref time, which could be
> >>> while walking down the extent root while running a delayed ref.  This
> >>> should be fine, except we specifically lock eb's in the backref walking
> >>> code irrespective of path->skip_locking, which deadlocks the system.
> >>> Fix up the backref code to honor path->skip_locking, nobody will be
> >>> modifying the commit_root when we're searching so it's completely safe
> >>> to do.  Thanks,
> >>>
> >>> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> >>
> >> Reviewed-by: Qu Wenruo <wqu@suse.com>
> > 
> > What can we use from https://patchwork.kernel.org/patch/10725371/ to
> > add to the changelog? Is the deadlock caused by fb235dc06fac?
> 
> The error and cause part should be the same.

Ok, thanks I've copied the stacktrace and deadlock analysis and tags.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-16 16:00 [PATCH] btrfs: honor path->skip_locking in backref code Josef Bacik
  2019-01-17  1:04 ` Qu Wenruo
@ 2019-01-18 13:39 ` David Sterba
  2019-01-23 13:51 ` David Sterba
  2019-02-12  5:07 ` Qu Wenruo
  3 siblings, 0 replies; 14+ messages in thread
From: David Sterba @ 2019-01-18 13:39 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Wed, Jan 16, 2019 at 11:00:57AM -0500, Josef Bacik wrote:
> qgroups will do the old roots lookup at delayed ref time, which could be
> while walking down the extent root while running a delayed ref.  This
> should be fine, except we specifically lock eb's in the backref walking
> code irrespective of path->skip_locking, which deadlocks the system.
> Fix up the backref code to honor path->skip_locking, nobody will be
> modifying the commit_root when we're searching so it's completely safe
> to do.  Thanks,
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Overnight test of the untar/snap/snapdel/rm stress test is fine now,
there's a lockdep warning in the logs that I think have seen already.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-16 16:00 [PATCH] btrfs: honor path->skip_locking in backref code Josef Bacik
  2019-01-17  1:04 ` Qu Wenruo
  2019-01-18 13:39 ` David Sterba
@ 2019-01-23 13:51 ` David Sterba
  2019-02-12  5:07 ` Qu Wenruo
  3 siblings, 0 replies; 14+ messages in thread
From: David Sterba @ 2019-01-23 13:51 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Wed, Jan 16, 2019 at 11:00:57AM -0500, Josef Bacik wrote:
> qgroups will do the old roots lookup at delayed ref time, which could be
> while walking down the extent root while running a delayed ref.  This
> should be fine, except we specifically lock eb's in the backref walking
> code irrespective of path->skip_locking, which deadlocks the system.
> Fix up the backref code to honor path->skip_locking, nobody will be
> modifying the commit_root when we're searching so it's completely safe
> to do.  Thanks,
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

So this explodes at btrfs/007, I had the patch in a separate branch out of
misc-next and for-next and did not notice that, the functional tests passed but
fstests do not.

[   90.693679] kernel BUG at fs/btrfs/locking.c:286!
[   90.694914] invalid opcode: 0000 [#1] PREEMPT SMP
[   90.696152] CPU: 2 PID: 22976 Comm: btrfs Not tainted 4.20.0-rc7-default+ #419
[   90.698100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
[   90.700893] RIP: 0010:btrfs_assert_tree_read_locked.part.2+0x5/0x10 [btrfs]
[   90.706111] RSP: 0018:ffffa1b046447840 EFLAGS: 00010246
[   90.707035] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[   90.707987] RDX: 0000000069fc7000 RSI: 0000000000000002 RDI: ffff91c73784f480
[   90.709962] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[   90.711894] R10: ffff91c73ffc6000 R11: 0000000001a7f1c0 R12: ffff91c723d8dc60
[   90.713543] R13: ffff91c73784f480 R14: 0000000000000001 R15: ffff91c723d8daa8
[   90.715023] FS:  00007fdedfe778c0(0000) GS:ffff91c73d800000(0000) knlGS:0000000000000000
[   90.717170] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   90.718176] CR2: 00007fdedf58ae38 CR3: 0000000073d09000 CR4: 00000000000006e0
[   90.719112] Call Trace:
[   90.720810]  btrfs_set_lock_blocking_rw+0x9d/0xb0 [btrfs]
[   90.722187]  find_parent_nodes+0xaf5/0xf30 [btrfs]
[   90.723102]  ? __find_iref+0xc0/0xc0 [btrfs]
[   90.723798]  iterate_extent_inodes+0xf6/0x320 [btrfs]
[   90.724574]  ? btrfs_get_token_64+0x103/0x120 [btrfs]
[   90.725700]  ? kvm_sched_clock_read+0x5/0x10
[   90.726688]  ? sched_clock+0x5/0x10
[   90.727695]  ? process_extent+0x503/0x1210 [btrfs]
[   90.728899]  process_extent+0x503/0x1210 [btrfs]
[   90.729910]  ? __record_deleted_ref+0x20/0x20 [btrfs]
[   90.731130]  changed_cb+0xb9e/0x1060 [btrfs]
[   90.732105]  send_subvol+0x4a0/0x580 [btrfs]
[   90.733142]  btrfs_ioctl_send+0x7c4/0xb20 [btrfs]
[   90.734188]  ? sched_clock_cpu+0xc/0xc0
[   90.735076]  ? _copy_from_user+0x63/0x90
[   90.736004]  _btrfs_ioctl_send+0xdd/0x110 [btrfs]
[   90.737077]  ? __lock_is_held+0x5a/0xa0
[   90.737962]  btrfs_ioctl+0xcd3/0x2f10 [btrfs]
[   90.738931]  ? __lock_acquire+0x263/0xf10
[   90.740025]  ? lockdep_hardirqs_on+0xe8/0x180
[   90.741082]  ? do_vfs_ioctl+0xa2/0x6d0
[   90.742077]  do_vfs_ioctl+0xa2/0x6d0
[   90.743151]  ? __fget+0x109/0x1e0
[   90.744117]  ksys_ioctl+0x3a/0x70
[   90.744988]  __x64_sys_ioctl+0x16/0x20
[   90.746035]  do_syscall_64+0x54/0x180
[   90.746997]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   90.748090] RIP: 0033:0x7fdedf67daa7

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-01-16 16:00 [PATCH] btrfs: honor path->skip_locking in backref code Josef Bacik
                   ` (2 preceding siblings ...)
  2019-01-23 13:51 ` David Sterba
@ 2019-02-12  5:07 ` Qu Wenruo
  2019-02-12 14:19   ` David Sterba
  3 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2019-02-12  5:07 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team

[-- Attachment #1.1: Type: text/plain, Size: 2964 bytes --]



On 2019/1/17 上午12:00, Josef Bacik wrote:
> qgroups will do the old roots lookup at delayed ref time, which could be
> while walking down the extent root while running a delayed ref.  This
> should be fine, except we specifically lock eb's in the backref walking
> code irrespective of path->skip_locking, which deadlocks the system.
> Fix up the backref code to honor path->skip_locking, nobody will be
> modifying the commit_root when we're searching so it's completely safe
> to do.  Thanks,
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/backref.c | 16 ++++++++++------
>  1 file changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
> index 78556447e1d5..973e8251b1bf 100644
> --- a/fs/btrfs/backref.c
> +++ b/fs/btrfs/backref.c
> @@ -712,7 +712,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>   * read tree blocks and add keys where required.
>   */
>  static int add_missing_keys(struct btrfs_fs_info *fs_info,
> -			    struct preftrees *preftrees)
> +			    struct preftrees *preftrees, bool lock)
>  {
>  	struct prelim_ref *ref;
>  	struct extent_buffer *eb;
> @@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
>  			free_extent_buffer(eb);
>  			return -EIO;
>  		}
> -		btrfs_tree_read_lock(eb);
> +		if (lock)
> +			btrfs_tree_read_lock(eb);
>  		if (btrfs_header_level(eb) == 0)
>  			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
>  		else
>  			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
> -		btrfs_tree_read_unlock(eb);
> +		if (lock)
> +			btrfs_tree_read_unlock(eb);
>  		free_extent_buffer(eb);
>  		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
>  		cond_resched();
> @@ -1227,7 +1229,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  
>  	btrfs_release_path(path);
>  
> -	ret = add_missing_keys(fs_info, &preftrees);
> +	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
>  	if (ret)
>  		goto out;
>  
> @@ -1288,11 +1290,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  					ret = -EIO;
>  					goto out;
>  				}
> -				btrfs_tree_read_lock(eb);
> +				if (!path->skip_locking)
> +					btrfs_tree_read_lock(eb);
>  				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

This btrfs_set_lock_blocking_rw() or the btrfs_set_lock_blocking_read()
in latest misc-next call need @eb to be read locked first.

So this line should also be in the (!path->skip_locking) branch, and
such modification solves the BUG_ON() caused by btrfs/007.

Thanks,
Qu

>  				ret = find_extent_in_eb(eb, bytenr,
>  							*extent_item_pos, &eie, ignore_offset);
> -				btrfs_tree_read_unlock_blocking(eb);
> +				if (!path->skip_locking)
> +					btrfs_tree_read_unlock_blocking(eb);
>  				free_extent_buffer(eb);
>  				if (ret < 0)
>  					goto out;
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-02-12  5:07 ` Qu Wenruo
@ 2019-02-12 14:19   ` David Sterba
  2019-02-12 14:25     ` Qu Wenruo
  0 siblings, 1 reply; 14+ messages in thread
From: David Sterba @ 2019-02-12 14:19 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Josef Bacik, linux-btrfs, kernel-team

On Tue, Feb 12, 2019 at 01:07:19PM +0800, Qu Wenruo wrote:
> > @@ -1288,11 +1290,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
> >  					ret = -EIO;
> >  					goto out;
> >  				}
> > -				btrfs_tree_read_lock(eb);
> > +				if (!path->skip_locking)
> > +					btrfs_tree_read_lock(eb);
> >  				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
> 
> This btrfs_set_lock_blocking_rw() or the btrfs_set_lock_blocking_read()
> in latest misc-next call need @eb to be read locked first.
> 
> So this line should also be in the (!path->skip_locking) branch, and
> such modification solves the BUG_ON() caused by btrfs/007.

Thanks.  btrfs_set_lock_blocking_rw is gone from misc-next so the patch
needs a refresh and resend, besides passing fstests of course.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-02-12 14:19   ` David Sterba
@ 2019-02-12 14:25     ` Qu Wenruo
  0 siblings, 0 replies; 14+ messages in thread
From: Qu Wenruo @ 2019-02-12 14:25 UTC (permalink / raw)
  To: dsterba, Josef Bacik, linux-btrfs, kernel-team

[-- Attachment #1.1: Type: text/plain, Size: 1098 bytes --]



On 2019/2/12 下午10:19, David Sterba wrote:
> On Tue, Feb 12, 2019 at 01:07:19PM +0800, Qu Wenruo wrote:
>>> @@ -1288,11 +1290,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>>>  					ret = -EIO;
>>>  					goto out;
>>>  				}
>>> -				btrfs_tree_read_lock(eb);
>>> +				if (!path->skip_locking)
>>> +					btrfs_tree_read_lock(eb);
>>>  				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
>>
>> This btrfs_set_lock_blocking_rw() or the btrfs_set_lock_blocking_read()
>> in latest misc-next call need @eb to be read locked first.
>>
>> So this line should also be in the (!path->skip_locking) branch, and
>> such modification solves the BUG_ON() caused by btrfs/007.
> 
> Thanks.  btrfs_set_lock_blocking_rw is gone from misc-next so the patch
> needs a refresh and resend, besides passing fstests of course.
> 
Yup, although the conflict is pretty small, just change the
btrfs_set_lock_blocking_rw() to btrfs_set_lock_blocking_read() will do it.

Does Josef or me need to resend the patch or would you mind to fold the
change?

Thanks,
Qu


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-05-28 10:18 Qu Wenruo
@ 2019-05-28 16:10 ` David Sterba
  0 siblings, 0 replies; 14+ messages in thread
From: David Sterba @ 2019-05-28 16:10 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, Josef Bacik, stable, David Sterba, Filipe Manana

On Tue, May 28, 2019 at 06:18:35PM +0800, Qu Wenruo wrote:
> CC: stable@vger.kernel.org # 4.19

This is for 4.19 stable tree.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: honor path->skip_locking in backref code
  2019-05-28 10:15 Qu Wenruo
@ 2019-05-28 16:10 ` David Sterba
  0 siblings, 0 replies; 14+ messages in thread
From: David Sterba @ 2019-05-28 16:10 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, Josef Bacik, stable, David Sterba, Filipe Manana

On Tue, May 28, 2019 at 06:15:52PM +0800, Qu Wenruo wrote:
> From: Josef Bacik <josef@toxicpanda.com>
> CC: stable@vger.kernel.org # 4.14

This applies to 4.14 stable tree.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] btrfs: honor path->skip_locking in backref code
@ 2019-05-28 10:18 Qu Wenruo
  2019-05-28 16:10 ` David Sterba
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2019-05-28 10:18 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Josef Bacik, stable, David Sterba, Filipe Manana

From: Josef Bacik <josef@toxicpanda.com>

Commit 38e3eebff643db725633657d1d87a3be019d1018.

Qgroups will do the old roots lookup at delayed ref time, which could be
while walking down the extent root while running a delayed ref.  This
should be fine, except we specifically lock eb's in the backref walking
code irrespective of path->skip_locking, which deadlocks the system.
Fix up the backref code to honor path->skip_locking, nobody will be
modifying the commit_root when we're searching so it's completely safe
to do.

This happens since fb235dc06fac ("btrfs: qgroup: Move half of the qgroup
accounting time out of commit trans"), kernel may lockup with quota
enabled.

There is one backref trace triggered by snapshot dropping along with
write operation in the source subvolume.  The example can be reliably
reproduced:

  btrfs-cleaner   D    0  4062      2 0x80000000
  Call Trace:
   schedule+0x32/0x90
   btrfs_tree_read_lock+0x93/0x130 [btrfs]
   find_parent_nodes+0x29b/0x1170 [btrfs]
   btrfs_find_all_roots_safe+0xa8/0x120 [btrfs]
   btrfs_find_all_roots+0x57/0x70 [btrfs]
   btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs]
   btrfs_qgroup_trace_leaf_items+0x10b/0x140 [btrfs]
   btrfs_qgroup_trace_subtree+0xc8/0xe0 [btrfs]
   do_walk_down+0x541/0x5e3 [btrfs]
   walk_down_tree+0xab/0xe7 [btrfs]
   btrfs_drop_snapshot+0x356/0x71a [btrfs]
   btrfs_clean_one_deleted_snapshot+0xb8/0xf0 [btrfs]
   cleaner_kthread+0x12b/0x160 [btrfs]
   kthread+0x112/0x130
   ret_from_fork+0x27/0x50

When dropping snapshots with qgroup enabled, we will trigger backref
walk.

However such backref walk at that timing is pretty dangerous, as if one
of the parent nodes get WRITE locked by other thread, we could cause a
dead lock.

For example:

           FS 260     FS 261 (Dropped)
            node A        node B
           /      \      /      \
       node C      node D      node E
      /   \         /  \        /     \
  leaf F|leaf G|leaf H|leaf I|leaf J|leaf K

The lock sequence would be:

      Thread A (cleaner)             |       Thread B (other writer)
-----------------------------------------------------------------------
write_lock(B)                        |
write_lock(D)                        |
^^^ called by walk_down_tree()       |
                                     |       write_lock(A)
                                     |       write_lock(D) << Stall
read_lock(H) << for backref walk     |
read_lock(D) << lock owner is        |
                the same thread A    |
                so read lock is OK   |
read_lock(A) << Stall                |

So thread A hold write lock D, and needs read lock A to unlock.
While thread B holds write lock A, while needs lock D to unlock.

This will cause a deadlock.

This is not only limited to snapshot dropping case.  As the backref
walk, even only happens on commit trees, is breaking the normal top-down
locking order, makes it deadlock prone.

Fixes: fb235dc06fac ("btrfs: qgroup: Move half of the qgroup accounting time out of commit trans")
CC: stable@vger.kernel.org # 4.19
Reported-and-tested-by: David Sterba <dsterba@suse.com>
Reported-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
[ rebase to latest branch and fix lock assert bug in btrfs/007 ]
[ backport to linux-4.19.y branch, solve minor conflicts ]
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ copy logs and deadlock analysis from Qu's patch ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2a4f52c7be22..ac6c383d6314 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -710,7 +710,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct preftrees *preftrees)
+			    struct preftrees *preftrees, bool lock)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
@@ -735,12 +735,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			free_extent_buffer(eb);
 			return -EIO;
 		}
-		btrfs_tree_read_lock(eb);
+		if (lock)
+			btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
 		else
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
-		btrfs_tree_read_unlock(eb);
+		if (lock)
+			btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
 		cond_resched();
@@ -1225,7 +1227,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &preftrees);
+	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
 	if (ret)
 		goto out;
 
@@ -1286,11 +1288,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 					ret = -EIO;
 					goto out;
 				}
-				btrfs_tree_read_lock(eb);
-				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				if (!path->skip_locking) {
+					btrfs_tree_read_lock(eb);
+					btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie, ignore_offset);
-				btrfs_tree_read_unlock_blocking(eb);
+				if (!path->skip_locking)
+					btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] btrfs: honor path->skip_locking in backref code
@ 2019-05-28 10:15 Qu Wenruo
  2019-05-28 16:10 ` David Sterba
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2019-05-28 10:15 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Josef Bacik, stable, David Sterba, Filipe Manana

From: Josef Bacik <josef@toxicpanda.com>

Commit 38e3eebff643db725633657d1d87a3be019d1018.

Qgroups will do the old roots lookup at delayed ref time, which could be
while walking down the extent root while running a delayed ref.  This
should be fine, except we specifically lock eb's in the backref walking
code irrespective of path->skip_locking, which deadlocks the system.
Fix up the backref code to honor path->skip_locking, nobody will be
modifying the commit_root when we're searching so it's completely safe
to do.

This happens since fb235dc06fac ("btrfs: qgroup: Move half of the qgroup
accounting time out of commit trans"), kernel may lockup with quota
enabled.

There is one backref trace triggered by snapshot dropping along with
write operation in the source subvolume.  The example can be reliably
reproduced:

  btrfs-cleaner   D    0  4062      2 0x80000000
  Call Trace:
   schedule+0x32/0x90
   btrfs_tree_read_lock+0x93/0x130 [btrfs]
   find_parent_nodes+0x29b/0x1170 [btrfs]
   btrfs_find_all_roots_safe+0xa8/0x120 [btrfs]
   btrfs_find_all_roots+0x57/0x70 [btrfs]
   btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs]
   btrfs_qgroup_trace_leaf_items+0x10b/0x140 [btrfs]
   btrfs_qgroup_trace_subtree+0xc8/0xe0 [btrfs]
   do_walk_down+0x541/0x5e3 [btrfs]
   walk_down_tree+0xab/0xe7 [btrfs]
   btrfs_drop_snapshot+0x356/0x71a [btrfs]
   btrfs_clean_one_deleted_snapshot+0xb8/0xf0 [btrfs]
   cleaner_kthread+0x12b/0x160 [btrfs]
   kthread+0x112/0x130
   ret_from_fork+0x27/0x50

When dropping snapshots with qgroup enabled, we will trigger backref
walk.

However such backref walk at that timing is pretty dangerous, as if one
of the parent nodes get WRITE locked by other thread, we could cause a
dead lock.

For example:

           FS 260     FS 261 (Dropped)
            node A        node B
           /      \      /      \
       node C      node D      node E
      /   \         /  \        /     \
  leaf F|leaf G|leaf H|leaf I|leaf J|leaf K

The lock sequence would be:

      Thread A (cleaner)             |       Thread B (other writer)
-----------------------------------------------------------------------
write_lock(B)                        |
write_lock(D)                        |
^^^ called by walk_down_tree()       |
                                     |       write_lock(A)
                                     |       write_lock(D) << Stall
read_lock(H) << for backref walk     |
read_lock(D) << lock owner is        |
                the same thread A    |
                so read lock is OK   |
read_lock(A) << Stall                |

So thread A hold write lock D, and needs read lock A to unlock.
While thread B holds write lock A, while needs lock D to unlock.

This will cause a deadlock.

This is not only limited to snapshot dropping case.  As the backref
walk, even only happens on commit trees, is breaking the normal top-down
locking order, makes it deadlock prone.

Fixes: fb235dc06fac ("btrfs: qgroup: Move half of the qgroup accounting time out of commit trans")
CC: stable@vger.kernel.org # 4.14
Reported-and-tested-by: David Sterba <dsterba@suse.com>
Reported-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
[ rebase to latest branch and fix lock assert bug in btrfs/007 ]
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ copy logs and deadlock analysis from Qu's patch ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 196503d8c993..e4d5e6eae409 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -718,7 +718,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct preftrees *preftrees)
+			    struct preftrees *preftrees, bool lock)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
@@ -742,12 +742,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			free_extent_buffer(eb);
 			return -EIO;
 		}
-		btrfs_tree_read_lock(eb);
+		if (lock)
+			btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
 		else
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
-		btrfs_tree_read_unlock(eb);
+		if (lock)
+			btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
 		cond_resched();
@@ -1228,7 +1230,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &preftrees);
+	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
 	if (ret)
 		goto out;
 
@@ -1288,11 +1290,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 					ret = -EIO;
 					goto out;
 				}
-				btrfs_tree_read_lock(eb);
-				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				if (!path->skip_locking) {
+					btrfs_tree_read_lock(eb);
+					btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie);
-				btrfs_tree_read_unlock_blocking(eb);
+				if (!path->skip_locking)
+					btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, back to index

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-16 16:00 [PATCH] btrfs: honor path->skip_locking in backref code Josef Bacik
2019-01-17  1:04 ` Qu Wenruo
2019-01-17 14:30   ` David Sterba
2019-01-17 14:38     ` Qu Wenruo
2019-01-17 15:23       ` David Sterba
2019-01-18 13:39 ` David Sterba
2019-01-23 13:51 ` David Sterba
2019-02-12  5:07 ` Qu Wenruo
2019-02-12 14:19   ` David Sterba
2019-02-12 14:25     ` Qu Wenruo
2019-05-28 10:15 Qu Wenruo
2019-05-28 16:10 ` David Sterba
2019-05-28 10:18 Qu Wenruo
2019-05-28 16:10 ` David Sterba

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox