linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 4.19 036/167] Btrfs: clean up scrub is_dev_replace parameter
       [not found] <20190903162519.7136-1-sashal@kernel.org>
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 037/167] Btrfs: fix deadlock with memory reclaim during scrub Sasha Levin
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Omar Sandoval, David Sterba, Sasha Levin, linux-btrfs

From: Omar Sandoval <osandov@fb.com>

[ Upstream commit 32934280967d00dc2b5c4d3b63b21a9c8638326e ]

struct scrub_ctx has an ->is_dev_replace member, so there's no point in
passing around is_dev_replace where sctx is available.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/scrub.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116b..4bcc275f76128 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3022,8 +3022,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length,
-					   int is_dev_replace)
+					   int num, u64 base, u64 length)
 {
 	struct btrfs_path *path, *ppath;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3299,7 +3298,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			extent_physical = extent_logical - logical + physical;
 			extent_dev = scrub_dev;
 			extent_mirror_num = mirror_num;
-			if (is_dev_replace)
+			if (sctx->is_dev_replace)
 				scrub_remap_extent(fs_info, extent_logical,
 						   extent_len, &extent_physical,
 						   &extent_dev,
@@ -3397,8 +3396,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 					  struct btrfs_device *scrub_dev,
 					  u64 chunk_offset, u64 length,
 					  u64 dev_offset,
-					  struct btrfs_block_group_cache *cache,
-					  int is_dev_replace)
+					  struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
@@ -3435,8 +3433,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
 			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   chunk_offset, length,
-					   is_dev_replace);
+					   chunk_offset, length);
 			if (ret)
 				goto out;
 		}
@@ -3449,8 +3446,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 
 static noinline_for_stack
 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
-			   struct btrfs_device *scrub_dev, u64 start, u64 end,
-			   int is_dev_replace)
+			   struct btrfs_device *scrub_dev, u64 start, u64 end)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
@@ -3544,7 +3540,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		 */
 		scrub_pause_on(fs_info);
 		ret = btrfs_inc_block_group_ro(cache);
-		if (!ret && is_dev_replace) {
+		if (!ret && sctx->is_dev_replace) {
 			/*
 			 * If we are doing a device replace wait for any tasks
 			 * that started dellaloc right before we set the block
@@ -3609,7 +3605,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		dev_replace->item_needs_writeback = 1;
 		btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
 		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
-				  found_key.offset, cache, is_dev_replace);
+				  found_key.offset, cache);
 
 		/*
 		 * flush, submit all pending read and write bios, afterwards
@@ -3670,7 +3666,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
-		if (is_dev_replace &&
+		if (sctx->is_dev_replace &&
 		    atomic64_read(&dev_replace->num_write_errors) > 0) {
 			ret = -EIO;
 			break;
@@ -3893,8 +3889,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	}
 
 	if (!ret)
-		ret = scrub_enumerate_chunks(sctx, dev, start, end,
-					     is_dev_replace);
+		ret = scrub_enumerate_chunks(sctx, dev, start, end);
 
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 037/167] Btrfs: fix deadlock with memory reclaim during scrub
       [not found] <20190903162519.7136-1-sashal@kernel.org>
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 036/167] Btrfs: clean up scrub is_dev_replace parameter Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 038/167] btrfs: Remove extent_io_ops::fill_delalloc Sasha Levin
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Filipe Manana, Nikolay Borisov, David Sterba, Sasha Levin, linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

[ Upstream commit a5fb11429167ee6ddeeacc554efaf5776b36433a ]

When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.

Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().

We also can not have any of the worker tasks, created by the scrub task,
doing GFP_KERNEL allocations, because before pausing, the scrub task waits
for all the worker tasks to complete (also done at scrub_stripe()).

So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.

Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
CC: stable@vger.kernel.org # 4.6+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/scrub.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4bcc275f76128..5a2d10ba747f7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,6 +322,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
 	struct rb_node *parent = NULL;
 	struct full_stripe_lock *entry;
 	struct full_stripe_lock *ret;
+	unsigned int nofs_flag;
 
 	lockdep_assert_held(&locks_root->lock);
 
@@ -339,8 +340,17 @@ static struct full_stripe_lock *insert_full_stripe_lock(
 		}
 	}
 
-	/* Insert new lock */
+	/*
+	 * Insert new lock.
+	 *
+	 * We must use GFP_NOFS because the scrub task might be waiting for a
+	 * worker task executing this function and in turn a transaction commit
+	 * might be waiting the scrub task to pause (which needs to wait for all
+	 * the worker tasks to complete before pausing).
+	 */
+	nofs_flag = memalloc_nofs_save();
 	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
 	if (!ret)
 		return ERR_PTR(-ENOMEM);
 	ret->logical = fstripe_logical;
@@ -1622,8 +1632,19 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 	mutex_lock(&sctx->wr_lock);
 again:
 	if (!sctx->wr_curr_bio) {
+		unsigned int nofs_flag;
+
+		/*
+		 * We must use GFP_NOFS because the scrub task might be waiting
+		 * for a worker task executing this function and in turn a
+		 * transaction commit might be waiting the scrub task to pause
+		 * (which needs to wait for all the worker tasks to complete
+		 * before pausing).
+		 */
+		nofs_flag = memalloc_nofs_save();
 		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
 					      GFP_KERNEL);
+		memalloc_nofs_restore(nofs_flag);
 		if (!sctx->wr_curr_bio) {
 			mutex_unlock(&sctx->wr_lock);
 			return -ENOMEM;
@@ -3775,6 +3796,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	struct scrub_ctx *sctx;
 	int ret;
 	struct btrfs_device *dev;
+	unsigned int nofs_flag;
 
 	if (btrfs_fs_closing(fs_info))
 		return -EINVAL;
@@ -3878,6 +3900,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
 
+	/*
+	 * In order to avoid deadlock with reclaim when there is a transaction
+	 * trying to pause scrub, make sure we use GFP_NOFS for all the
+	 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+	 * invoked by our callees. The pausing request is done when the
+	 * transaction commit starts, and it blocks the transaction until scrub
+	 * is paused (done at specific points at scrub_stripe() or right above
+	 * before incrementing fs_info->scrubs_running).
+	 */
+	nofs_flag = memalloc_nofs_save();
 	if (!is_dev_replace) {
 		/*
 		 * by holding device list mutex, we can
@@ -3890,6 +3922,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	if (!ret)
 		ret = scrub_enumerate_chunks(sctx, dev, start, end);
+	memalloc_nofs_restore(nofs_flag);
 
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 038/167] btrfs: Remove extent_io_ops::fill_delalloc
       [not found] <20190903162519.7136-1-sashal@kernel.org>
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 036/167] Btrfs: clean up scrub is_dev_replace parameter Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 037/167] Btrfs: fix deadlock with memory reclaim during scrub Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 039/167] btrfs: Fix error handling in btrfs_cleanup_ordered_extents Sasha Levin
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nikolay Borisov, Josef Bacik, David Sterba, Sasha Levin, linux-btrfs

From: Nikolay Borisov <nborisov@suse.com>

[ Upstream commit 5eaad97af8aeff38debe7d3c69ec3a0d71f8350f ]

This callback is called only from writepage_delalloc which in turn is
guaranteed to be called from the data page writeout path. In the end
there is no reason to have the call to this function to be indrected via
the extent_io_ops structure. This patch removes the callback definition,
exports the function and calls it directly. No functional changes.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ rename to btrfs_run_delalloc_range ]
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/ctree.h     |  3 +++
 fs/btrfs/extent_io.c | 20 +++++++++-----------
 fs/btrfs/extent_io.h |  5 -----
 fs/btrfs/inode.c     | 15 +++++++--------
 4 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 82682da5a40dd..4644f9b629a53 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3200,6 +3200,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 				    struct btrfs_trans_handle *trans, int mode,
 				    u64 start, u64 num_bytes, u64 min_size,
 				    loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+		u64 start, u64 end, int *page_started, unsigned long *nr_written,
+		struct writeback_control *wbc);
 extern const struct dentry_operations btrfs_dentry_operations;
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_inode_set_ops(struct inode *inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 90b0a6eff5350..cb598eb4f3bd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3199,7 +3199,7 @@ static void update_nr_written(struct writeback_control *wbc,
 /*
  * helper for __extent_writepage, doing all of the delayed allocation setup.
  *
- * This returns 1 if our fill_delalloc function did all the work required
+ * This returns 1 if btrfs_run_delalloc_range function did all the work required
  * to write the page (copy into inline extent).  In this case the IO has
  * been started and the page is already unlocked.
  *
@@ -3220,7 +3220,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 	int ret;
 	int page_started = 0;
 
-	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+	if (epd->extent_locked)
 		return 0;
 
 	while (delalloc_end < page_end) {
@@ -3233,18 +3233,16 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		ret = tree->ops->fill_delalloc(inode, page,
-					       delalloc_start,
-					       delalloc_end,
-					       &page_started,
-					       nr_written, wbc);
+		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
+				delalloc_end, &page_started, nr_written, wbc);
 		/* File system has been set read-only */
 		if (ret) {
 			SetPageError(page);
-			/* fill_delalloc should be return < 0 for error
-			 * but just in case, we use > 0 here meaning the
-			 * IO is started, so we don't want to return > 0
-			 * unless things are going well.
+			/*
+			 * btrfs_run_delalloc_range should return < 0 for error
+			 * but just in case, we use > 0 here meaning the IO is
+			 * started, so we don't want to return > 0 unless
+			 * things are going well.
 			 */
 			ret = ret < 0 ? ret : -EIO;
 			goto done;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e677e1d7..ed27becd963c5 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -106,11 +106,6 @@ struct extent_io_ops {
 	/*
 	 * Optional hooks, called if the pointer is not NULL
 	 */
-	int (*fill_delalloc)(void *private_data, struct page *locked_page,
-			     u64 start, u64 end, int *page_started,
-			     unsigned long *nr_written,
-			     struct writeback_control *wbc);
-
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 355ff08e9d44e..bfacce295ef1e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -110,8 +110,8 @@ static void __endio_write_update_ordered(struct inode *inode,
  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
  * to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
+ * extent (btrfs_finish_ordered_io()). Also note that the caller of
+ * btrfs_run_delalloc_range already does proper cleanup for the first page of
  * the range, that is, it invokes the callback writepage_end_io_hook() for the
  * range of the first page.
  */
@@ -1599,12 +1599,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
 }
 
 /*
- * extent_io.c call back to do delayed allocation processing
+ * Function to process delayed allocation (create CoW) for ranges which are
+ * being touched for the first time.
  */
-static int run_delalloc_range(void *private_data, struct page *locked_page,
-			      u64 start, u64 end, int *page_started,
-			      unsigned long *nr_written,
-			      struct writeback_control *wbc)
+int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+		u64 start, u64 end, int *page_started, unsigned long *nr_written,
+		struct writeback_control *wbc)
 {
 	struct inode *inode = private_data;
 	int ret;
@@ -10598,7 +10598,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
 
 	/* optional callbacks */
-	.fill_delalloc = run_delalloc_range,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 039/167] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (2 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 038/167] btrfs: Remove extent_io_ops::fill_delalloc Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 046/167] btrfs: volumes: Make sure no dev extent is beyond device boundary Sasha Levin
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nikolay Borisov, Josef Bacik, David Sterba, Sasha Levin, linux-btrfs

From: Nikolay Borisov <nborisov@suse.com>

[ Upstream commit d1051d6ebf8ef3517a5a3cf82bba8436d190f1c2 ]

Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:

	btrfs           D    0  5760   5324 0x00000000
	Call Trace:
	 ? __schedule+0x243/0x800
	 schedule+0x33/0x90
	 btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
	 ? wait_woken+0xa0/0xa0
	 btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
	 btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
	 btrfs_relocate_chunk+0x49/0x100 [btrfs]
	 btrfs_balance+0xbeb/0x1740 [btrfs]
	 btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
	 btrfs_ioctl+0x1691/0x3110 [btrfs]
	 ? lockdep_hardirqs_on+0xed/0x180
	 ? __handle_mm_fault+0x8e7/0xfb0
	 ? _raw_spin_unlock+0x24/0x30
	 ? __handle_mm_fault+0x8e7/0xfb0
	 ? do_vfs_ioctl+0xa5/0x6e0
	 ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
	 do_vfs_ioctl+0xa5/0x6e0
	 ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
	 ksys_ioctl+0x3a/0x70
	 __x64_sys_ioctl+0x16/0x20
	 do_syscall_64+0x60/0x1b0
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

This happens because during page writeback it's valid for
writepage_delalloc to instantiate a delalloc range which doesn't belong
to the page currently being written back.

The reason this case is valid is due to find_lock_delalloc_range
returning any available range after the passed delalloc_start and
ignoring whether the page under writeback is within that range.

In turn ordered extents (OE) are always created for the returned range
from find_lock_delalloc_range. If, however, a failure occurs while OE
are being created then the clean up code in btrfs_cleanup_ordered_extents
will be called.

Unfortunately the code in btrfs_cleanup_ordered_extents doesn't consider
the case of such 'foreign' range being processed and instead it always
assumes that the range OE are created for belongs to the page. This
leads to the first page of such foregin range to not be cleaned up since
it's deliberately missed and skipped by the current cleaning up code.

Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so adjsut the range parameters passed
for cleaning up. If it doesn't, then just clean the whole OE range
directly.

Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/inode.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bfacce295ef1e..98c535ae038da 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -110,17 +110,17 @@ static void __endio_write_update_ordered(struct inode *inode,
  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
  * to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of
- * btrfs_run_delalloc_range already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
  */
 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
-						 const u64 offset,
-						 const u64 bytes)
+						 struct page *locked_page,
+						 u64 offset, u64 bytes)
 {
 	unsigned long index = offset >> PAGE_SHIFT;
 	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+	u64 page_start = page_offset(locked_page);
+	u64 page_end = page_start + PAGE_SIZE - 1;
+
 	struct page *page;
 
 	while (index <= end_index) {
@@ -131,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
 		ClearPagePrivate2(page);
 		put_page(page);
 	}
-	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
-					    bytes - PAGE_SIZE, false);
+
+	/*
+	 * In case this page belongs to the delalloc range being instantiated
+	 * then skip it, since the first page of a range is going to be
+	 * properly cleaned up by the caller of run_delalloc_range
+	 */
+	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+		offset += PAGE_SIZE;
+		bytes -= PAGE_SIZE;
+	}
+
+	return __endio_write_update_ordered(inode, offset, bytes, false);
 }
 
 static int btrfs_dirty_inode(struct inode *inode);
@@ -1629,7 +1639,8 @@ int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
 					   write_flags);
 	}
 	if (ret)
-		btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+		btrfs_cleanup_ordered_extents(inode, locked_page, start,
+					      end - start + 1);
 	return ret;
 }
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 046/167] btrfs: volumes: Make sure no dev extent is beyond device boundary
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (3 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 039/167] btrfs: Fix error handling in btrfs_cleanup_ordered_extents Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 047/167] btrfs: Use real device structure to verify dev extent Sasha Levin
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Qu Wenruo, David Sterba, Sasha Levin, linux-btrfs

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit 05a37c48604c19b50873fd9663f9140c150469d1 ]

Add extra dev extent end check against device boundary.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6e008bd5c8cd1..c20708bfae561 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7411,6 +7411,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
+	struct btrfs_device *dev;
 	u64 stripe_len;
 	bool found = false;
 	int ret = 0;
@@ -7460,6 +7461,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 			physical_offset, devid);
 		ret = -EUCLEAN;
 	}
+
+	/* Make sure no dev extent is beyond device bondary */
+	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+	if (!dev) {
+		btrfs_err(fs_info, "failed to find devid %llu", devid);
+		ret = -EUCLEAN;
+		goto out;
+	}
+	if (physical_offset + physical_len > dev->disk_total_bytes) {
+		btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+			  devid, physical_offset, physical_len,
+			  dev->disk_total_bytes);
+		ret = -EUCLEAN;
+		goto out;
+	}
 out:
 	free_extent_map(em);
 	return ret;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 047/167] btrfs: Use real device structure to verify dev extent
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (4 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 046/167] btrfs: volumes: Make sure no dev extent is beyond device boundary Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 077/167] btrfs: scrub: pass fs_info to scrub_setup_ctx Sasha Levin
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Qu Wenruo, Filipe Manana, David Sterba, Sasha Levin, linux-btrfs

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit 1b3922a8bc74231f9a767d1be6d9a061a4d4eeab ]

[BUG]
Linux v5.0-rc1 will fail fstests/btrfs/163 with the following kernel
message:

  BTRFS error (device dm-6): dev extent devid 1 physical offset 13631488 len 8388608 is beyond device boundary 0
  BTRFS error (device dm-6): failed to verify dev extents against chunks: -117
  BTRFS error (device dm-6): open_ctree failed

[CAUSE]
Commit cf90d884b347 ("btrfs: Introduce mount time chunk <-> dev extent
mapping check") introduced strict check on dev extents.

We use btrfs_find_device() with dev uuid and fs uuid set to NULL, and
only dependent on @devid to find the real device.

For seed devices, we call clone_fs_devices() in open_seed_devices() to
allow us search seed devices directly.

However clone_fs_devices() just populates devices with devid and dev
uuid, without populating other essential members, like disk_total_bytes.

This makes any device returned by btrfs_find_device(fs_info, devid,
NULL, NULL) is just a dummy, with 0 disk_total_bytes, and any dev
extents on the seed device will not pass the device boundary check.

[FIX]
This patch will try to verify the device returned by btrfs_find_device()
and if it's a dummy then re-search in seed devices.

Fixes: cf90d884b347 ("btrfs: Introduce mount time chunk <-> dev extent mapping check")
CC: stable@vger.kernel.org # 4.19+
Reported-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c20708bfae561..a8297e7489d98 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7469,6 +7469,18 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 		ret = -EUCLEAN;
 		goto out;
 	}
+
+	/* It's possible this device is a dummy for seed device */
+	if (dev->disk_total_bytes == 0) {
+		dev = find_device(fs_info->fs_devices->seed, devid, NULL);
+		if (!dev) {
+			btrfs_err(fs_info, "failed to find seed devid %llu",
+				  devid);
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
 	if (physical_offset + physical_len > dev->disk_total_bytes) {
 		btrfs_err(fs_info,
 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 077/167] btrfs: scrub: pass fs_info to scrub_setup_ctx
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (5 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 047/167] btrfs: Use real device structure to verify dev extent Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 078/167] btrfs: scrub: move scrub_setup_ctx allocation out of device_list_mutex Sasha Levin
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: David Sterba, Nikolay Borisov, Sasha Levin, linux-btrfs

From: David Sterba <dsterba@suse.com>

[ Upstream commit 92f7ba434f51e8e9317f1d166105889aa230abd2 ]

We can pass fs_info directly as this is the only member of btrfs_device
that's bing used inside scrub_setup_ctx.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/scrub.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a2d10ba747f7..efaad3e1b295a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -578,12 +578,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
 		scrub_free_ctx(sctx);
 }
 
-static noinline_for_stack
-struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
+		struct btrfs_fs_info *fs_info, int is_dev_replace)
 {
 	struct scrub_ctx *sctx;
 	int		i;
-	struct btrfs_fs_info *fs_info = dev->fs_info;
 
 	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
 	if (!sctx)
@@ -592,7 +591,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx->curr = -1;
-	sctx->fs_info = dev->fs_info;
+	sctx->fs_info = fs_info;
 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
@@ -3881,7 +3880,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return ret;
 	}
 
-	sctx = scrub_setup_ctx(dev, is_dev_replace);
+	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
 	if (IS_ERR(sctx)) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 078/167] btrfs: scrub: move scrub_setup_ctx allocation out of device_list_mutex
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (6 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 077/167] btrfs: scrub: pass fs_info to scrub_setup_ctx Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 079/167] btrfs: scrub: fix circular locking dependency warning Sasha Levin
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: David Sterba, Sasha Levin, linux-btrfs

From: David Sterba <dsterba@suse.com>

[ Upstream commit 0e94c4f45d14cf89d1f40c91b0a8517e791672a7 ]

The scrub context is allocated with GFP_KERNEL and called from
btrfs_scrub_dev under the fs_info::device_list_mutex. This is not safe
regarding reclaim that could try to flush filesystem data in order to
get the memory. And the device_list_mutex is held during superblock
commit, so this would cause a lockup.

Move the alocation and initialization before any changes that require
the mutex.

Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/scrub.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efaad3e1b295a..56c4d2236484f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3837,13 +3837,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EINVAL;
 	}
 
+	/* Allocate outside of device_list_mutex */
+	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
+	if (IS_ERR(sctx))
+		return PTR_ERR(sctx);
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
 		     !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto out_free_ctx;
 	}
 
 	if (!is_dev_replace && !readonly &&
@@ -3851,7 +3856,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
 				rcu_str_deref(dev->name));
-		return -EROFS;
+		ret = -EROFS;
+		goto out_free_ctx;
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
@@ -3859,7 +3865,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -EIO;
+		ret = -EIO;
+		goto out_free_ctx;
 	}
 
 	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
@@ -3869,7 +3876,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -EINPROGRESS;
+		ret = -EINPROGRESS;
+		goto out_free_ctx;
 	}
 	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
 
@@ -3877,16 +3885,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	if (ret) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return ret;
+		goto out_free_ctx;
 	}
 
-	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
-	if (IS_ERR(sctx)) {
-		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
-		return PTR_ERR(sctx);
-	}
 	sctx->readonly = readonly;
 	dev->scrub_ctx = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3939,6 +3940,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	scrub_put_ctx(sctx);
 
+	return ret;
+
+out_free_ctx:
+	scrub_free_ctx(sctx);
+
 	return ret;
 }
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 079/167] btrfs: scrub: fix circular locking dependency warning
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (7 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 078/167] btrfs: scrub: move scrub_setup_ctx allocation out of device_list_mutex Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 080/167] btrfs: init csum_list before possible free Sasha Levin
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Anand Jain, David Sterba, Sasha Levin, linux-btrfs

From: Anand Jain <anand.jain@oracle.com>

[ Upstream commit 1cec3f27168d7835ff3d23ab371cd548440131bb ]

This fixes a longstanding lockdep warning triggered by
fstests/btrfs/011.

Circular locking dependency check reports warning[1], that's because the
btrfs_scrub_dev() calls the stack #0 below with, the fs_info::scrub_lock
held. The test case leading to this warning:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /btrfs
  $ btrfs scrub start -B /btrfs

In fact we have fs_info::scrub_workers_refcnt to track if the init and destroy
of the scrub workers are needed. So once we have incremented and decremented
the fs_info::scrub_workers_refcnt value in the thread, its ok to drop the
scrub_lock, and then actually do the btrfs_destroy_workqueue() part. So this
patch drops the scrub_lock before calling btrfs_destroy_workqueue().

  [359.258534] ======================================================
  [359.260305] WARNING: possible circular locking dependency detected
  [359.261938] 5.0.0-rc6-default #461 Not tainted
  [359.263135] ------------------------------------------------------
  [359.264672] btrfs/20975 is trying to acquire lock:
  [359.265927] 00000000d4d32bea ((wq_completion)"%s-%s""btrfs", name){+.+.}, at: flush_workqueue+0x87/0x540
  [359.268416]
  [359.268416] but task is already holding lock:
  [359.270061] 0000000053ea26a6 (&fs_info->scrub_lock){+.+.}, at: btrfs_scrub_dev+0x322/0x590 [btrfs]
  [359.272418]
  [359.272418] which lock already depends on the new lock.
  [359.272418]
  [359.274692]
  [359.274692] the existing dependency chain (in reverse order) is:
  [359.276671]
  [359.276671] -> #3 (&fs_info->scrub_lock){+.+.}:
  [359.278187]        __mutex_lock+0x86/0x9c0
  [359.279086]        btrfs_scrub_pause+0x31/0x100 [btrfs]
  [359.280421]        btrfs_commit_transaction+0x1e4/0x9e0 [btrfs]
  [359.281931]        close_ctree+0x30b/0x350 [btrfs]
  [359.283208]        generic_shutdown_super+0x64/0x100
  [359.284516]        kill_anon_super+0x14/0x30
  [359.285658]        btrfs_kill_super+0x12/0xa0 [btrfs]
  [359.286964]        deactivate_locked_super+0x29/0x60
  [359.288242]        cleanup_mnt+0x3b/0x70
  [359.289310]        task_work_run+0x98/0xc0
  [359.290428]        exit_to_usermode_loop+0x83/0x90
  [359.291445]        do_syscall_64+0x15b/0x180
  [359.292598]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [359.294011]
  [359.294011] -> #2 (sb_internal#2){.+.+}:
  [359.295432]        __sb_start_write+0x113/0x1d0
  [359.296394]        start_transaction+0x369/0x500 [btrfs]
  [359.297471]        btrfs_finish_ordered_io+0x2aa/0x7c0 [btrfs]
  [359.298629]        normal_work_helper+0xcd/0x530 [btrfs]
  [359.299698]        process_one_work+0x246/0x610
  [359.300898]        worker_thread+0x3c/0x390
  [359.302020]        kthread+0x116/0x130
  [359.303053]        ret_from_fork+0x24/0x30
  [359.304152]
  [359.304152] -> #1 ((work_completion)(&work->normal_work)){+.+.}:
  [359.306100]        process_one_work+0x21f/0x610
  [359.307302]        worker_thread+0x3c/0x390
  [359.308465]        kthread+0x116/0x130
  [359.309357]        ret_from_fork+0x24/0x30
  [359.310229]
  [359.310229] -> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}:
  [359.311812]        lock_acquire+0x90/0x180
  [359.312929]        flush_workqueue+0xaa/0x540
  [359.313845]        drain_workqueue+0xa1/0x180
  [359.314761]        destroy_workqueue+0x17/0x240
  [359.315754]        btrfs_destroy_workqueue+0x57/0x200 [btrfs]
  [359.317245]        scrub_workers_put+0x2c/0x60 [btrfs]
  [359.318585]        btrfs_scrub_dev+0x336/0x590 [btrfs]
  [359.319944]        btrfs_dev_replace_by_ioctl.cold.19+0x179/0x1bb [btrfs]
  [359.321622]        btrfs_ioctl+0x28a4/0x2e40 [btrfs]
  [359.322908]        do_vfs_ioctl+0xa2/0x6d0
  [359.324021]        ksys_ioctl+0x3a/0x70
  [359.325066]        __x64_sys_ioctl+0x16/0x20
  [359.326236]        do_syscall_64+0x54/0x180
  [359.327379]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [359.328772]
  [359.328772] other info that might help us debug this:
  [359.328772]
  [359.330990] Chain exists of:
  [359.330990]   (wq_completion)"%s-%s""btrfs", name --> sb_internal#2 --> &fs_info->scrub_lock
  [359.330990]
  [359.334376]  Possible unsafe locking scenario:
  [359.334376]
  [359.336020]        CPU0                    CPU1
  [359.337070]        ----                    ----
  [359.337821]   lock(&fs_info->scrub_lock);
  [359.338506]                                lock(sb_internal#2);
  [359.339506]                                lock(&fs_info->scrub_lock);
  [359.341461]   lock((wq_completion)"%s-%s""btrfs", name);
  [359.342437]
  [359.342437]  *** DEADLOCK ***
  [359.342437]
  [359.343745] 1 lock held by btrfs/20975:
  [359.344788]  #0: 0000000053ea26a6 (&fs_info->scrub_lock){+.+.}, at: btrfs_scrub_dev+0x322/0x590 [btrfs]
  [359.346778]
  [359.346778] stack backtrace:
  [359.347897] CPU: 0 PID: 20975 Comm: btrfs Not tainted 5.0.0-rc6-default #461
  [359.348983] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
  [359.350501] Call Trace:
  [359.350931]  dump_stack+0x67/0x90
  [359.351676]  print_circular_bug.isra.37.cold.56+0x15c/0x195
  [359.353569]  check_prev_add.constprop.44+0x4f9/0x750
  [359.354849]  ? check_prev_add.constprop.44+0x286/0x750
  [359.356505]  __lock_acquire+0xb84/0xf10
  [359.357505]  lock_acquire+0x90/0x180
  [359.358271]  ? flush_workqueue+0x87/0x540
  [359.359098]  flush_workqueue+0xaa/0x540
  [359.359912]  ? flush_workqueue+0x87/0x540
  [359.360740]  ? drain_workqueue+0x1e/0x180
  [359.361565]  ? drain_workqueue+0xa1/0x180
  [359.362391]  drain_workqueue+0xa1/0x180
  [359.363193]  destroy_workqueue+0x17/0x240
  [359.364539]  btrfs_destroy_workqueue+0x57/0x200 [btrfs]
  [359.365673]  scrub_workers_put+0x2c/0x60 [btrfs]
  [359.366618]  btrfs_scrub_dev+0x336/0x590 [btrfs]
  [359.367594]  ? start_transaction+0xa1/0x500 [btrfs]
  [359.368679]  btrfs_dev_replace_by_ioctl.cold.19+0x179/0x1bb [btrfs]
  [359.369545]  btrfs_ioctl+0x28a4/0x2e40 [btrfs]
  [359.370186]  ? __lock_acquire+0x263/0xf10
  [359.370777]  ? kvm_clock_read+0x14/0x30
  [359.371392]  ? kvm_sched_clock_read+0x5/0x10
  [359.372248]  ? sched_clock+0x5/0x10
  [359.372786]  ? sched_clock_cpu+0xc/0xc0
  [359.373662]  ? do_vfs_ioctl+0xa2/0x6d0
  [359.374552]  do_vfs_ioctl+0xa2/0x6d0
  [359.375378]  ? do_sigaction+0xff/0x250
  [359.376233]  ksys_ioctl+0x3a/0x70
  [359.376954]  __x64_sys_ioctl+0x16/0x20
  [359.377772]  do_syscall_64+0x54/0x180
  [359.378841]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [359.380422] RIP: 0033:0x7f5429296a97

Backporting to older kernels: scrub_nocow_workers must be freed the same
way as the others.

CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain@oracle.com>
[ update changelog ]
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/scrub.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 56c4d2236484f..a08a4d6f540f9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3778,16 +3778,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	return -ENOMEM;
 }
 
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
-	if (--fs_info->scrub_workers_refcnt == 0) {
-		btrfs_destroy_workqueue(fs_info->scrub_workers);
-		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
-		btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
-	}
-	WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace)
@@ -3796,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	int ret;
 	struct btrfs_device *dev;
 	unsigned int nofs_flag;
+	struct btrfs_workqueue *scrub_workers = NULL;
+	struct btrfs_workqueue *scrub_wr_comp = NULL;
+	struct btrfs_workqueue *scrub_parity = NULL;
 
 	if (btrfs_fs_closing(fs_info))
 		return -EINVAL;
@@ -3935,9 +3928,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_ctx = NULL;
-	scrub_workers_put(fs_info);
+	if (--fs_info->scrub_workers_refcnt == 0) {
+		scrub_workers = fs_info->scrub_workers;
+		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+		scrub_parity = fs_info->scrub_parity_workers;
+	}
 	mutex_unlock(&fs_info->scrub_lock);
 
+	btrfs_destroy_workqueue(scrub_workers);
+	btrfs_destroy_workqueue(scrub_wr_comp);
+	btrfs_destroy_workqueue(scrub_parity);
 	scrub_put_ctx(sctx);
 
 	return ret;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 080/167] btrfs: init csum_list before possible free
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (8 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 079/167] btrfs: scrub: fix circular locking dependency warning Sasha Levin
@ 2019-09-03 16:23 ` Sasha Levin
  2019-09-03 16:24 ` [PATCH AUTOSEL 4.19 118/167] Btrfs: fix race between block group removal and block group allocation Sasha Levin
  2019-09-03 16:24 ` [PATCH AUTOSEL 4.19 141/167] btrfs: correctly validate compression type Sasha Levin
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:23 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Dan Robertson, Nikolay Borisov, David Sterba, Sasha Levin, linux-btrfs

From: Dan Robertson <dan@dlrobertson.com>

[ Upstream commit e49be14b8d80e23bb7c53d78c21717a474ade76b ]

The scrub_ctx csum_list member must be initialized before scrub_free_ctx
is called. If the csum_list is not initialized beforehand, the
list_empty call in scrub_free_csums will result in a null deref if the
allocation fails in the for loop.

Fixes: a2de733c78fa ("btrfs: scrub")
CC: stable@vger.kernel.org # 3.0+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Dan Robertson <dan@dlrobertson.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/scrub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a08a4d6f540f9..916c397704679 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -592,6 +592,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx->curr = -1;
 	sctx->fs_info = fs_info;
+	INIT_LIST_HEAD(&sctx->csum_list);
 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
@@ -616,7 +617,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	atomic_set(&sctx->workers_pending, 0);
 	atomic_set(&sctx->cancel_req, 0);
 	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	INIT_LIST_HEAD(&sctx->csum_list);
 
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 118/167] Btrfs: fix race between block group removal and block group allocation
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (9 preceding siblings ...)
  2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 080/167] btrfs: init csum_list before possible free Sasha Levin
@ 2019-09-03 16:24 ` Sasha Levin
  2019-09-03 16:24 ` [PATCH AUTOSEL 4.19 141/167] btrfs: correctly validate compression type Sasha Levin
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:24 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Filipe Manana, David Sterba, Sasha Levin, linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

[ Upstream commit 8eaf40c0e24e98899a0f3ac9d25a33aafe13822a ]

If a task is removing the block group that currently has the highest start
offset amongst all existing block groups, there is a short time window
where it races with a concurrent block group allocation, resulting in a
transaction abort with an error code of EEXIST.

The following diagram explains the race in detail:

      Task A                                                        Task B

 btrfs_remove_block_group(bg offset X)

   remove_extent_mapping(em offset X)
     -> removes extent map X from the
        tree of extent maps
        (fs_info->mapping_tree), so the
        next call to find_next_chunk()
        will return offset X

                                                   btrfs_alloc_chunk()
                                                     find_next_chunk()
                                                       --> returns offset X

                                                     __btrfs_alloc_chunk(offset X)
                                                       btrfs_make_block_group()
                                                         btrfs_create_block_group_cache()
                                                           --> creates btrfs_block_group_cache
                                                               object with a key corresponding
                                                               to the block group item in the
                                                               extent, the key is:
                                                               (offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)

                                                         --> adds the btrfs_block_group_cache object
                                                             to the list new_bgs of the transaction
                                                             handle

                                                   btrfs_end_transaction(trans handle)
                                                     __btrfs_end_transaction()
                                                       btrfs_create_pending_block_groups()
                                                         --> sees the new btrfs_block_group_cache
                                                             in the new_bgs list of the transaction
                                                             handle
                                                         --> its call to btrfs_insert_item() fails
                                                             with -EEXIST when attempting to insert
                                                             the block group item key
                                                             (offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
                                                             because task A has not removed that key yet
                                                         --> aborts the running transaction with
                                                             error -EEXIST

   btrfs_del_item()
     -> removes the block group's key from
        the extent tree, key is
        (offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)

A sample transaction abort trace:

  [78912.403537] ------------[ cut here ]------------
  [78912.403811] BTRFS: Transaction aborted (error -17)
  [78912.404082] WARNING: CPU: 2 PID: 20465 at fs/btrfs/extent-tree.c:10551 btrfs_create_pending_block_groups+0x196/0x250 [btrfs]
  (...)
  [78912.405642] CPU: 2 PID: 20465 Comm: btrfs Tainted: G        W         5.0.0-btrfs-next-46 #1
  [78912.405941] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
  [78912.406586] RIP: 0010:btrfs_create_pending_block_groups+0x196/0x250 [btrfs]
  (...)
  [78912.407636] RSP: 0018:ffff9d3d4b7e3b08 EFLAGS: 00010282
  [78912.407997] RAX: 0000000000000000 RBX: ffff90959a3796f0 RCX: 0000000000000006
  [78912.408369] RDX: 0000000000000007 RSI: 0000000000000001 RDI: ffff909636b16860
  [78912.408746] RBP: ffff909626758a58 R08: 0000000000000000 R09: 0000000000000000
  [78912.409144] R10: ffff9095ff462400 R11: 0000000000000000 R12: ffff90959a379588
  [78912.409521] R13: ffff909626758ab0 R14: ffff9095036c0000 R15: ffff9095299e1158
  [78912.409899] FS:  00007f387f16f700(0000) GS:ffff909636b00000(0000) knlGS:0000000000000000
  [78912.410285] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [78912.410673] CR2: 00007f429fc87cbc CR3: 000000014440a004 CR4: 00000000003606e0
  [78912.411095] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [78912.411496] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [78912.411898] Call Trace:
  [78912.412318]  __btrfs_end_transaction+0x5b/0x1c0 [btrfs]
  [78912.412746]  btrfs_inc_block_group_ro+0xcf/0x160 [btrfs]
  [78912.413179]  scrub_enumerate_chunks+0x188/0x5b0 [btrfs]
  [78912.413622]  ? __mutex_unlock_slowpath+0x100/0x2a0
  [78912.414078]  btrfs_scrub_dev+0x2ef/0x720 [btrfs]
  [78912.414535]  ? __sb_start_write+0xd4/0x1c0
  [78912.414963]  ? mnt_want_write_file+0x24/0x50
  [78912.415403]  btrfs_ioctl+0x17fb/0x3120 [btrfs]
  [78912.415832]  ? lock_acquire+0xa6/0x190
  [78912.416256]  ? do_vfs_ioctl+0xa2/0x6f0
  [78912.416685]  ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
  [78912.417116]  do_vfs_ioctl+0xa2/0x6f0
  [78912.417534]  ? __fget+0x113/0x200
  [78912.417954]  ksys_ioctl+0x70/0x80
  [78912.418369]  __x64_sys_ioctl+0x16/0x20
  [78912.418812]  do_syscall_64+0x60/0x1b0
  [78912.419231]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [78912.419644] RIP: 0033:0x7f3880252dd7
  (...)
  [78912.420957] RSP: 002b:00007f387f16ed68 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  [78912.421426] RAX: ffffffffffffffda RBX: 000055f5becc1df0 RCX: 00007f3880252dd7
  [78912.421889] RDX: 000055f5becc1df0 RSI: 00000000c400941b RDI: 0000000000000003
  [78912.422354] RBP: 0000000000000000 R08: 00007f387f16f700 R09: 0000000000000000
  [78912.422790] R10: 00007f387f16f700 R11: 0000000000000246 R12: 0000000000000000
  [78912.423202] R13: 00007ffda49c266f R14: 0000000000000000 R15: 00007f388145e040
  [78912.425505] ---[ end trace eb9bfe7c426fc4d3 ]---

Fix this by calling remove_extent_mapping(), at btrfs_remove_block_group(),
only at the very end, after removing the block group item key from the
extent tree (and removing the free space tree entry if we are using the
free space tree feature).

Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/extent-tree.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0cc800d22a081..88c939f7aad96 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10478,22 +10478,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&block_group->lock);
 
-	if (remove_em) {
-		struct extent_map_tree *em_tree;
-
-		em_tree = &fs_info->mapping_tree.map_tree;
-		write_lock(&em_tree->lock);
-		/*
-		 * The em might be in the pending_chunks list, so make sure the
-		 * chunk mutex is locked, since remove_extent_mapping() will
-		 * delete us from that list.
-		 */
-		remove_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		/* once for the tree */
-		free_extent_map(em);
-	}
-
 	mutex_unlock(&fs_info->chunk_mutex);
 
 	ret = remove_block_group_free_space(trans, block_group);
@@ -10510,6 +10494,24 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 
 	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &fs_info->mapping_tree.map_tree;
+		write_lock(&em_tree->lock);
+		/*
+		 * The em might be in the pending_chunks list, so make sure the
+		 * chunk mutex is locked, since remove_extent_mapping() will
+		 * delete us from that list.
+		 */
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
 out:
 	btrfs_free_path(path);
 	return ret;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH AUTOSEL 4.19 141/167] btrfs: correctly validate compression type
       [not found] <20190903162519.7136-1-sashal@kernel.org>
                   ` (10 preceding siblings ...)
  2019-09-03 16:24 ` [PATCH AUTOSEL 4.19 118/167] Btrfs: fix race between block group removal and block group allocation Sasha Levin
@ 2019-09-03 16:24 ` Sasha Levin
  11 siblings, 0 replies; 12+ messages in thread
From: Sasha Levin @ 2019-09-03 16:24 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Johannes Thumshirn, Nikolay Borisov, David Sterba, Sasha Levin,
	linux-btrfs

From: Johannes Thumshirn <jthumshirn@suse.de>

[ Upstream commit aa53e3bfac7205fb3a8815ac1c937fd6ed01b41e ]

Nikolay reported the following KASAN splat when running btrfs/048:

[ 1843.470920] ==================================================================
[ 1843.471971] BUG: KASAN: slab-out-of-bounds in strncmp+0x66/0xb0
[ 1843.472775] Read of size 1 at addr ffff888111e369e2 by task btrfs/3979

[ 1843.473904] CPU: 3 PID: 3979 Comm: btrfs Not tainted 5.2.0-rc3-default #536
[ 1843.475009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[ 1843.476322] Call Trace:
[ 1843.476674]  dump_stack+0x7c/0xbb
[ 1843.477132]  ? strncmp+0x66/0xb0
[ 1843.477587]  print_address_description+0x114/0x320
[ 1843.478256]  ? strncmp+0x66/0xb0
[ 1843.478740]  ? strncmp+0x66/0xb0
[ 1843.479185]  __kasan_report+0x14e/0x192
[ 1843.479759]  ? strncmp+0x66/0xb0
[ 1843.480209]  kasan_report+0xe/0x20
[ 1843.480679]  strncmp+0x66/0xb0
[ 1843.481105]  prop_compression_validate+0x24/0x70
[ 1843.481798]  btrfs_xattr_handler_set_prop+0x65/0x160
[ 1843.482509]  __vfs_setxattr+0x71/0x90
[ 1843.483012]  __vfs_setxattr_noperm+0x84/0x130
[ 1843.483606]  vfs_setxattr+0xac/0xb0
[ 1843.484085]  setxattr+0x18c/0x230
[ 1843.484546]  ? vfs_setxattr+0xb0/0xb0
[ 1843.485048]  ? __mod_node_page_state+0x1f/0xa0
[ 1843.485672]  ? _raw_spin_unlock+0x24/0x40
[ 1843.486233]  ? __handle_mm_fault+0x988/0x1290
[ 1843.486823]  ? lock_acquire+0xb4/0x1e0
[ 1843.487330]  ? lock_acquire+0xb4/0x1e0
[ 1843.487842]  ? mnt_want_write_file+0x3c/0x80
[ 1843.488442]  ? debug_lockdep_rcu_enabled+0x22/0x40
[ 1843.489089]  ? rcu_sync_lockdep_assert+0xe/0x70
[ 1843.489707]  ? __sb_start_write+0x158/0x200
[ 1843.490278]  ? mnt_want_write_file+0x3c/0x80
[ 1843.490855]  ? __mnt_want_write+0x98/0xe0
[ 1843.491397]  __x64_sys_fsetxattr+0xba/0xe0
[ 1843.492201]  ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 1843.493201]  do_syscall_64+0x6c/0x230
[ 1843.493988]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.495041] RIP: 0033:0x7fa7a8a7707a
[ 1843.495819] Code: 48 8b 0d 21 de 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 be 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ee dd 2b 00 f7 d8 64 89 01 48
[ 1843.499203] RSP: 002b:00007ffcb73bca38 EFLAGS: 00000202 ORIG_RAX: 00000000000000be
[ 1843.500210] RAX: ffffffffffffffda RBX: 00007ffcb73bda9d RCX: 00007fa7a8a7707a
[ 1843.501170] RDX: 00007ffcb73bda9d RSI: 00000000006dc050 RDI: 0000000000000003
[ 1843.502152] RBP: 00000000006dc050 R08: 0000000000000000 R09: 0000000000000000
[ 1843.503109] R10: 0000000000000002 R11: 0000000000000202 R12: 00007ffcb73bda91
[ 1843.504055] R13: 0000000000000003 R14: 00007ffcb73bda82 R15: ffffffffffffffff

[ 1843.505268] Allocated by task 3979:
[ 1843.505771]  save_stack+0x19/0x80
[ 1843.506211]  __kasan_kmalloc.constprop.5+0xa0/0xd0
[ 1843.506836]  setxattr+0xeb/0x230
[ 1843.507264]  __x64_sys_fsetxattr+0xba/0xe0
[ 1843.507886]  do_syscall_64+0x6c/0x230
[ 1843.508429]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[ 1843.509558] Freed by task 0:
[ 1843.510188] (stack is not available)

[ 1843.511309] The buggy address belongs to the object at ffff888111e369e0
                which belongs to the cache kmalloc-8 of size 8
[ 1843.514095] The buggy address is located 2 bytes inside of
                8-byte region [ffff888111e369e0, ffff888111e369e8)
[ 1843.516524] The buggy address belongs to the page:
[ 1843.517561] page:ffff88813f478d80 refcount:1 mapcount:0 mapping:ffff88811940c300 index:0xffff888111e373b8 compound_mapcount: 0
[ 1843.519993] flags: 0x4404000010200(slab|head)
[ 1843.520951] raw: 0004404000010200 ffff88813f48b008 ffff888119403d50 ffff88811940c300
[ 1843.522616] raw: ffff888111e373b8 000000000016000f 00000001ffffffff 0000000000000000
[ 1843.524281] page dumped because: kasan: bad access detected

[ 1843.525936] Memory state around the buggy address:
[ 1843.526975]  ffff888111e36880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.528479]  ffff888111e36900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.530138] >ffff888111e36980: fc fc fc fc fc fc fc fc fc fc fc fc 02 fc fc fc
[ 1843.531877]                                                        ^
[ 1843.533287]  ffff888111e36a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.534874]  ffff888111e36a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.536468] ==================================================================

This is caused by supplying a too short compression value ('lz') in the
test-case and comparing it to 'lzo' with strncmp() and a length of 3.
strncmp() read past the 'lz' when looking for the 'o' and thus caused an
out-of-bounds read.

Introduce a new check 'btrfs_compress_is_valid_type()' which not only
checks the user-supplied value against known compression types, but also
employs checks for too short values.

Reported-by: Nikolay Borisov <nborisov@suse.com>
Fixes: 272e5326c783 ("btrfs: prop: fix vanished compression property after failed set")
CC: stable@vger.kernel.org # 5.1+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/compression.c | 16 ++++++++++++++++
 fs/btrfs/compression.h |  1 +
 fs/btrfs/props.c       |  6 +-----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa66592aa7b..c71e534ca7ef6 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -42,6 +42,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
 	return NULL;
 }
 
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+	int i;
+
+	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+		size_t comp_len = strlen(btrfs_compress_types[i]);
+
+		if (len < comp_len)
+			continue;
+
+		if (!strncmp(btrfs_compress_types[i], str, comp_len))
+			return true;
+	}
+	return false;
+}
+
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b80bf204..f97d90a1fa531 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -127,6 +127,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
 extern const struct btrfs_compress_op btrfs_zstd_compress;
 
 const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
 
 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
 
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 61d22a56c0ba4..6980a0e13f18e 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -366,11 +366,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
 
 static int prop_compression_validate(const char *value, size_t len)
 {
-	if (!strncmp("lzo", value, 3))
-		return 0;
-	else if (!strncmp("zlib", value, 4))
-		return 0;
-	else if (!strncmp("zstd", value, 4))
+	if (btrfs_compress_is_valid_type(value, len))
 		return 0;
 
 	return -EINVAL;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2019-09-03 16:38 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20190903162519.7136-1-sashal@kernel.org>
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 036/167] Btrfs: clean up scrub is_dev_replace parameter Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 037/167] Btrfs: fix deadlock with memory reclaim during scrub Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 038/167] btrfs: Remove extent_io_ops::fill_delalloc Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 039/167] btrfs: Fix error handling in btrfs_cleanup_ordered_extents Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 046/167] btrfs: volumes: Make sure no dev extent is beyond device boundary Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 047/167] btrfs: Use real device structure to verify dev extent Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 077/167] btrfs: scrub: pass fs_info to scrub_setup_ctx Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 078/167] btrfs: scrub: move scrub_setup_ctx allocation out of device_list_mutex Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 079/167] btrfs: scrub: fix circular locking dependency warning Sasha Levin
2019-09-03 16:23 ` [PATCH AUTOSEL 4.19 080/167] btrfs: init csum_list before possible free Sasha Levin
2019-09-03 16:24 ` [PATCH AUTOSEL 4.19 118/167] Btrfs: fix race between block group removal and block group allocation Sasha Levin
2019-09-03 16:24 ` [PATCH AUTOSEL 4.19 141/167] btrfs: correctly validate compression type Sasha Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).