linux-btrfs.vger.kernel.org archive mirror
* [PATCH 0/4][RFC] clean up how we mark block groups read only
@ 2019-11-25 14:40 Josef Bacik
  2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
                   ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: Josef Bacik @ 2019-11-25 14:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team, wqu

Qu has been looking into ENOSPC during relocation and noticed some weirdness
with inc_block_group_ro.  The problem is that this code hasn't kept up with
the rest of the reservation system.  It still assumes that the amount of
space used will always be less than the total space for the space info, which
hasn't been true for years, ever since I introduced overcommitting.  This
logic is correct for DATA, but not for METADATA or SYSTEM.
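
To make the problem concrete, here is a minimal userspace model of the two
checks.  This is a sketch, not kernel code: the struct fields and all the
numbers are simplified stand-ins for the real btrfs_space_info members.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct space_info {
        uint64_t total_bytes;   /* bytes in allocated chunks */
        uint64_t used_bytes;    /* includes overcommitted reservations */
};

/* The inc_block_group_ro() style check: only valid when used <= total. */
static bool fits_without_overcommit(const struct space_info *s, uint64_t bytes)
{
        return s->used_bytes + bytes <= s->total_bytes;
}

/*
 * Overcommit-aware check: metadata/system may reserve past total_bytes
 * as long as unallocated device space could still back it.
 */
static bool fits_with_overcommit(const struct space_info *s, uint64_t bytes,
                                 uint64_t unallocated)
{
        return s->used_bytes + bytes <= s->total_bytes + unallocated;
}

int main(void)
{
        /* 1 GiB of metadata chunks, 1.5 GiB "used" because reservations
         * overcommit, and 8 GiB of unallocated disk. */
        struct space_info meta = {
                .total_bytes = 1ULL << 30,
                .used_bytes = 3ULL << 29,
        };
        uint64_t bg_free = 256ULL << 20;

        printf("old-style check:  %d\n", fits_without_overcommit(&meta, bg_free));
        printf("overcommit-aware: %d\n",
               fits_with_overcommit(&meta, bg_free, 8ULL << 30));
        return 0;
}

The first check fails even though the filesystem has plenty of room, which
is exactly the METADATA/SYSTEM failure mode described above.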

The first few patches are just cleanups, and can probably be taken as is.  The
final patch is the real meat of the problem to address Qu's issues.  This is an
RFC because I'm elbow deep in another problem and haven't tested this beyond
compile testing, but I think it'll work.  Once I've gotten a chance to test it
and Qu has tested it I'll update the list if it's good to go as is, or send a V2
with any changes.  Thanks,

Josef


* [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit
  2019-11-25 14:40 [PATCH 0/4][RFC] clean up how we mark block groups read only Josef Bacik
@ 2019-11-25 14:40 ` Josef Bacik
  2019-11-25 15:36   ` Johannes Thumshirn
                     ` (2 more replies)
  2019-11-25 14:40 ` [PATCH 2/4] btrfs: kill min_allocable_bytes in inc_block_group_ro Josef Bacik
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 18+ messages in thread
From: Josef Bacik @ 2019-11-25 14:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team, wqu

We have the space_info, we can just check its flags to see if it's the
system chunk space info.
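
For illustration only, here is a tiny standalone sketch of the idea.  The
flag values mirror the real BTRFS_BLOCK_GROUP_* bits, but everything else
is invented for the example.

#include <stdint.h>
#include <stdio.h>

#define BTRFS_BLOCK_GROUP_DATA          (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM        (1ULL << 1)
#define BTRFS_BLOCK_GROUP_METADATA      (1ULL << 2)

struct space_info {
        uint64_t flags;
};

/*
 * Before: callers threaded a "bool system_chunk" down the call chain.
 * After: the space_info's own flags already tell us which profile to use.
 */
static const char *profile_for(const struct space_info *sinfo)
{
        if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
                return "system alloc profile";
        return "metadata alloc profile";
}

int main(void)
{
        struct space_info sys = { .flags = BTRFS_BLOCK_GROUP_SYSTEM };
        struct space_info meta = { .flags = BTRFS_BLOCK_GROUP_METADATA };

        printf("%s\n%s\n", profile_for(&sys), profile_for(&meta));
        return 0;
}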

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/space-info.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f09aa6ee9113..df5fb68df798 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -161,8 +161,7 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 
 static int can_overcommit(struct btrfs_fs_info *fs_info,
 			  struct btrfs_space_info *space_info, u64 bytes,
-			  enum btrfs_reserve_flush_enum flush,
-			  bool system_chunk)
+			  enum btrfs_reserve_flush_enum flush)
 {
 	u64 profile;
 	u64 avail;
@@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
 		return 0;
 
-	if (system_chunk)
+	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		profile = btrfs_system_alloc_profile(fs_info);
 	else
 		profile = btrfs_metadata_alloc_profile(fs_info);
@@ -227,8 +226,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
 
 		/* Check and see if our ticket can be satisified now. */
 		if ((used + ticket->bytes <= space_info->total_bytes) ||
-		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
-				   false)) {
+		    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
 			btrfs_space_info_update_bytes_may_use(fs_info,
 							      space_info,
 							      ticket->bytes);
@@ -626,8 +624,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 
 static inline u64
 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
-				 struct btrfs_space_info *space_info,
-				 bool system_chunk)
+				 struct btrfs_space_info *space_info)
 {
 	struct reserve_ticket *ticket;
 	u64 used;
@@ -643,13 +640,13 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 
 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
 	if (can_overcommit(fs_info, space_info, to_reclaim,
-			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+			   BTRFS_RESERVE_FLUSH_ALL))
 		return 0;
 
 	used = btrfs_space_info_used(space_info, true);
 
 	if (can_overcommit(fs_info, space_info, SZ_1M,
-			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+			   BTRFS_RESERVE_FLUSH_ALL))
 		expected = div_factor_fine(space_info->total_bytes, 95);
 	else
 		expected = div_factor_fine(space_info->total_bytes, 90);
@@ -665,7 +662,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 
 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 					struct btrfs_space_info *space_info,
-					u64 used, bool system_chunk)
+					u64 used)
 {
 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
 
@@ -673,8 +670,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
 		return 0;
 
-	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-					      system_chunk))
+	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
 		return 0;
 
 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -765,8 +761,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 
 	spin_lock(&space_info->lock);
-	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-						      false);
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
 	if (!to_reclaim) {
 		space_info->flush = 0;
 		spin_unlock(&space_info->lock);
@@ -785,8 +780,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 			return;
 		}
 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
-							      space_info,
-							      false);
+							      space_info);
 		if (last_tickets_id == space_info->tickets_id) {
 			flush_state++;
 		} else {
@@ -858,8 +852,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 	int flush_state;
 
 	spin_lock(&space_info->lock);
-	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-						      false);
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
 	if (!to_reclaim) {
 		spin_unlock(&space_info->lock);
 		return;
@@ -990,8 +983,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 				    struct btrfs_space_info *space_info,
 				    u64 orig_bytes,
-				    enum btrfs_reserve_flush_enum flush,
-				    bool system_chunk)
+				    enum btrfs_reserve_flush_enum flush)
 {
 	struct reserve_ticket ticket;
 	u64 used;
@@ -1013,8 +1005,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 */
 	if (!pending_tickets &&
 	    ((used + orig_bytes <= space_info->total_bytes) ||
-	     can_overcommit(fs_info, space_info, orig_bytes, flush,
-			   system_chunk))) {
+	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
 		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
 						      orig_bytes);
 		ret = 0;
@@ -1054,8 +1045,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 		 * the async reclaim as we will panic.
 		 */
 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
-		    need_do_async_reclaim(fs_info, space_info,
-					  used, system_chunk) &&
+		    need_do_async_reclaim(fs_info, space_info, used) &&
 		    !work_busy(&fs_info->async_reclaim_work)) {
 			trace_btrfs_trigger_flush(fs_info, space_info->flags,
 						  orig_bytes, flush, "preempt");
@@ -1092,10 +1082,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	int ret;
-	bool system_chunk = (root == fs_info->chunk_root);
 
 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
-				       orig_bytes, flush, system_chunk);
+				       orig_bytes, flush);
 	if (ret == -ENOSPC &&
 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
 		if (block_rsv != global_rsv &&
-- 
2.23.0


* [PATCH 2/4] btrfs: kill min_allocable_bytes in inc_block_group_ro
  2019-11-25 14:40 [PATCH 0/4][RFC] clean up how we mark block groups read only Josef Bacik
  2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
@ 2019-11-25 14:40 ` Josef Bacik
  2019-11-26  2:35   ` Qu Wenruo
  2019-11-25 14:40 ` [PATCH 3/4] btrfs: fix force usage " Josef Bacik
  2019-11-25 14:40 ` [PATCH 4/4] btrfs: use btrfs_can_overcommit " Josef Bacik
  3 siblings, 1 reply; 18+ messages in thread
From: Josef Bacik @ 2019-11-25 14:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team, wqu

This is a relic from a time before we had a proper reservation mechanism
and you could end up with really full chunks at chunk allocation time.
This doesn't make sense anymore, so just kill it.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/block-group.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6934a5b8708f..db539bfc5a52 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1185,21 +1185,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 	struct btrfs_space_info *sinfo = cache->space_info;
 	u64 num_bytes;
 	u64 sinfo_used;
-	u64 min_allocable_bytes;
 	int ret = -ENOSPC;
 
-	/*
-	 * We need some metadata space and system metadata space for
-	 * allocating chunks in some corner cases until we force to set
-	 * it to be readonly.
-	 */
-	if ((sinfo->flags &
-	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
-	    !force)
-		min_allocable_bytes = SZ_1M;
-	else
-		min_allocable_bytes = 0;
-
 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);
 
@@ -1217,10 +1204,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
 	 *
 	 * Here we make sure if we mark this bg RO, we still have enough
-	 * free space as buffer (if min_allocable_bytes is not 0).
+	 * free space as buffer.
 	 */
-	if (sinfo_used + num_bytes + min_allocable_bytes <=
-	    sinfo->total_bytes) {
+	if (sinfo_used + num_bytes + sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
 		cache->ro++;
 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -1233,8 +1219,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 		btrfs_info(cache->fs_info,
 			"unable to make block group %llu ro", cache->start);
 		btrfs_info(cache->fs_info,
-			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
-			sinfo_used, num_bytes, min_allocable_bytes);
+			"sinfo_used=%llu bg_num_bytes=%llu",
+			sinfo_used, num_bytes);
 		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
 	}
 	return ret;
-- 
2.23.0


* [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-25 14:40 [PATCH 0/4][RFC] clean up how we mark block groups read only Josef Bacik
  2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
  2019-11-25 14:40 ` [PATCH 2/4] btrfs: kill min_allocable_bytes in inc_block_group_ro Josef Bacik
@ 2019-11-25 14:40 ` Josef Bacik
  2019-11-26  2:43   ` Qu Wenruo
  2019-11-26 10:09   ` Nikolay Borisov
  2019-11-25 14:40 ` [PATCH 4/4] btrfs: use btrfs_can_overcommit " Josef Bacik
  3 siblings, 2 replies; 18+ messages in thread
From: Josef Bacik @ 2019-11-25 14:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team, wqu

For some reason we've translated the do_chunk_alloc that goes into
btrfs_inc_block_group_ro to force in inc_block_group_ro, but these are
two different things.

force for inc_block_group_ro is used when we are forcing the block group
read only no matter what, for example when the underlying chunk is
marked read only.  We need to not do the space check here as this block
group needs to be read only.

btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates that
we need to pre-allocate a chunk before marking the block group read
only.  This has nothing to do with forcing, and in fact we _always_ want
to do the space check in this case, so unconditionally pass false for
force in this case.

Then fixup inc_block_group_ro to honor force as it's expected and
documented to do.
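
A hypothetical, self-contained sketch of the two knobs; the names mirror
the kernel functions but the bodies are heavily simplified.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct block_group {
        int ro;
        bool space_check_ok;    /* stand-in for the space_info math */
};

/* force: make the block group RO no matter what, skipping the space check. */
static int inc_block_group_ro(struct block_group *bg, bool force)
{
        if (bg->ro || force) {
                bg->ro++;
                return 0;
        }
        if (!bg->space_check_ok)
                return -ENOSPC;
        bg->ro++;
        return 0;
}

/*
 * do_chunk_alloc: pre-allocate a chunk first.  It never implies force,
 * so the space check always runs.
 */
static int btrfs_inc_block_group_ro(struct block_group *bg, bool do_chunk_alloc)
{
        if (do_chunk_alloc) {
                /* ... allocate a fresh chunk here ... */
        }
        return inc_block_group_ro(bg, false);
}

int main(void)
{
        struct block_group bg = { .ro = 0, .space_check_ok = false };

        /* do_chunk_alloc does not bypass the space check: */
        printf("%d\n", btrfs_inc_block_group_ro(&bg, true));    /* -ENOSPC */
        /* force does: */
        printf("%d\n", inc_block_group_ro(&bg, true));          /* 0 */
        return 0;
}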

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/block-group.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index db539bfc5a52..3ffbc2e0af21 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1190,8 +1190,10 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);
 
-	if (cache->ro) {
+	if (cache->ro || force) {
 		cache->ro++;
+		if (list_empty(&cache->ro_list))
+			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 		ret = 0;
 		goto out;
 	}
@@ -2063,7 +2065,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 		}
 	}
 
-	ret = inc_block_group_ro(cache, !do_chunk_alloc);
+	ret = inc_block_group_ro(cache, false);
 	if (!do_chunk_alloc)
 		goto unlock_out;
 	if (!ret)
-- 
2.23.0


* [PATCH 4/4] btrfs: use btrfs_can_overcommit in inc_block_group_ro
  2019-11-25 14:40 [PATCH 0/4][RFC] clean up how we mark block groups read only Josef Bacik
                   ` (2 preceding siblings ...)
  2019-11-25 14:40 ` [PATCH 3/4] btrfs: fix force usage " Josef Bacik
@ 2019-11-25 14:40 ` Josef Bacik
  2019-11-26  3:00   ` Qu Wenruo
  2019-11-26 10:18   ` Nikolay Borisov
  3 siblings, 2 replies; 18+ messages in thread
From: Josef Bacik @ 2019-11-25 14:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team, wqu

inc_block_group_ro does a calculation to see whether we would have enough
room left over if we marked this block group read only, in order to
decide whether it's ok to do so.

The problem is this calculation _only_ works for data, where our used is
always less than our total.  For metadata we will overcommit, so this
will almost always fail for metadata.

Fix this by exporting btrfs_can_overcommit, and then checking whether we
have enough space to cover the remaining free space in the block group we
are trying to mark read only.  If we do then we can mark this block
group as read only.
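
As a rough userspace model of what this buys us (the FLUSH_ALL halving
reflects my reading of can_overcommit(); the profile factor, sizes, and
field names are invented for the example):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum reserve_flush { RESERVE_NO_FLUSH, RESERVE_FLUSH_ALL };

/*
 * Simplified model: "unallocated" is free device space, scaled down by
 * the profile factor (2 for RAID1/DUP).  FLUSH_ALL only overcommits half
 * of it, which is why NO_FLUSH gives the most leeway.
 */
static bool can_overcommit(uint64_t total, uint64_t used, uint64_t bytes,
                           uint64_t unallocated, unsigned int factor,
                           enum reserve_flush flush)
{
        uint64_t avail = unallocated / factor;

        if (flush == RESERVE_FLUSH_ALL)
                avail >>= 1;

        return used + bytes < total + avail;
}

int main(void)
{
        const uint64_t G = 1ULL << 30, M = 1ULL << 20;
        /* 1 GiB of metadata chunks, 1.25 GiB reserved, the block group we
         * want to mark RO has 512 MiB free, 2 GiB unallocated, RAID1. */
        uint64_t total = G, used = G + 256 * M, bytes = 512 * M;

        printf("NO_FLUSH:  %d\n",
               can_overcommit(total, used, bytes, 2 * G, 2, RESERVE_NO_FLUSH));
        printf("FLUSH_ALL: %d\n",
               can_overcommit(total, used, bytes, 2 * G, 2, RESERVE_FLUSH_ALL));
        return 0;
}

With NO_FLUSH the block group can be marked read only here; with FLUSH_ALL
it could not, which is the extra leeway the comment in the patch refers to.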

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/block-group.c | 37 ++++++++++++++++++++++++++-----------
 fs/btrfs/space-info.c  | 19 ++++++++++---------
 fs/btrfs/space-info.h  |  3 +++
 3 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3ffbc2e0af21..7b1f6d2b9621 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1184,7 +1184,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
 	u64 num_bytes;
-	u64 sinfo_used;
 	int ret = -ENOSPC;
 
 	spin_lock(&sinfo->lock);
@@ -1200,19 +1199,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 
 	num_bytes = cache->length - cache->reserved - cache->pinned -
 		    cache->bytes_super - cache->used;
-	sinfo_used = btrfs_space_info_used(sinfo, true);
 
 	/*
-	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
-	 *
-	 * Here we make sure if we mark this bg RO, we still have enough
-	 * free space as buffer.
+	 * Data never overcommits, even in mixed mode, so do just the straight
+	 * check of left over space in how much we have allocated.
 	 */
-	if (sinfo_used + num_bytes + sinfo->total_bytes) {
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
+		u64 sinfo_used = btrfs_space_info_used(sinfo, true);
+
+		/*
+		 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
+		 *
+		 * Here we make sure if we mark this bg RO, we still have enough
+		 * free space as buffer.
+		 */
+		if (sinfo_used + num_bytes + sinfo->total_bytes)
+			ret = 0;
+	} else {
+		/*
+		 * We overcommit metadata, so we need to do the
+		 * btrfs_can_overcommit check here, and we need to pass in
+		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
+		 * leeway to allow us to mark this block group as read only.
+		 */
+		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
+					 BTRFS_RESERVE_NO_FLUSH))
+			ret = 0;
+	}
+
+	if (!ret) {
 		sinfo->bytes_readonly += num_bytes;
 		cache->ro++;
 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
-		ret = 0;
 	}
 out:
 	spin_unlock(&cache->lock);
@@ -1220,9 +1238,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
 		btrfs_info(cache->fs_info,
 			"unable to make block group %llu ro", cache->start);
-		btrfs_info(cache->fs_info,
-			"sinfo_used=%llu bg_num_bytes=%llu",
-			sinfo_used, num_bytes);
 		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
 	}
 	return ret;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index df5fb68df798..01297c5b2666 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -159,9 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 	return (global->size << 1);
 }
 
-static int can_overcommit(struct btrfs_fs_info *fs_info,
-			  struct btrfs_space_info *space_info, u64 bytes,
-			  enum btrfs_reserve_flush_enum flush)
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+			 struct btrfs_space_info *space_info, u64 bytes,
+			 enum btrfs_reserve_flush_enum flush)
 {
 	u64 profile;
 	u64 avail;
@@ -226,7 +226,8 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
 
 		/* Check and see if our ticket can be satisified now. */
 		if ((used + ticket->bytes <= space_info->total_bytes) ||
-		    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
+		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
+					 flush)) {
 			btrfs_space_info_update_bytes_may_use(fs_info,
 							      space_info,
 							      ticket->bytes);
@@ -639,14 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 		return to_reclaim;
 
 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
-	if (can_overcommit(fs_info, space_info, to_reclaim,
-			   BTRFS_RESERVE_FLUSH_ALL))
+	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
+				 BTRFS_RESERVE_FLUSH_ALL))
 		return 0;
 
 	used = btrfs_space_info_used(space_info, true);
 
-	if (can_overcommit(fs_info, space_info, SZ_1M,
-			   BTRFS_RESERVE_FLUSH_ALL))
+	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
+				 BTRFS_RESERVE_FLUSH_ALL))
 		expected = div_factor_fine(space_info->total_bytes, 95);
 	else
 		expected = div_factor_fine(space_info->total_bytes, 90);
@@ -1005,7 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 */
 	if (!pending_tickets &&
 	    ((used + orig_bytes <= space_info->total_bytes) ||
-	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
+	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
 		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
 						      orig_bytes);
 		ret = 0;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 1a349e3f9cc1..24514cd2c6c1 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
 				 enum btrfs_reserve_flush_enum flush);
 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
 				struct btrfs_space_info *space_info);
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+			 struct btrfs_space_info *space_info, u64 bytes,
+			 enum btrfs_reserve_flush_enum flush);
 
 static inline void btrfs_space_info_free_bytes_may_use(
 				struct btrfs_fs_info *fs_info,
-- 
2.23.0


* Re: [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit
  2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
@ 2019-11-25 15:36   ` Johannes Thumshirn
  2019-11-26  2:32   ` Qu Wenruo
  2019-11-26  8:42   ` Nikolay Borisov
  2 siblings, 0 replies; 18+ messages in thread
From: Johannes Thumshirn @ 2019-11-25 15:36 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu

Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
-- 
Johannes Thumshirn                            SUSE Labs Filesystems
jthumshirn@suse.de                                +49 911 74053 689
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5
90409 Nürnberg
Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

* Re: [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit
  2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
  2019-11-25 15:36   ` Johannes Thumshirn
@ 2019-11-26  2:32   ` Qu Wenruo
  2019-11-26  8:42   ` Nikolay Borisov
  2 siblings, 0 replies; 18+ messages in thread
From: Qu Wenruo @ 2019-11-26  2:32 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu


On 2019/11/25 at 10:40 PM, Josef Bacik wrote:
> We have the space_info, we can just check its flags to see if it's the
> system chunk space info.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

> ---
>  fs/btrfs/space-info.c | 41 +++++++++++++++--------------------------
>  1 file changed, 15 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
> index f09aa6ee9113..df5fb68df798 100644
> --- a/fs/btrfs/space-info.c
> +++ b/fs/btrfs/space-info.c
> @@ -161,8 +161,7 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
>  
>  static int can_overcommit(struct btrfs_fs_info *fs_info,
>  			  struct btrfs_space_info *space_info, u64 bytes,
> -			  enum btrfs_reserve_flush_enum flush,
> -			  bool system_chunk)
> +			  enum btrfs_reserve_flush_enum flush)
>  {
>  	u64 profile;
>  	u64 avail;
> @@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
>  	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
>  		return 0;
>  
> -	if (system_chunk)
> +	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
>  		profile = btrfs_system_alloc_profile(fs_info);
>  	else
>  		profile = btrfs_metadata_alloc_profile(fs_info);
> @@ -227,8 +226,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
>  
>  		/* Check and see if our ticket can be satisified now. */
>  		if ((used + ticket->bytes <= space_info->total_bytes) ||
> -		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
> -				   false)) {
> +		    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
>  			btrfs_space_info_update_bytes_may_use(fs_info,
>  							      space_info,
>  							      ticket->bytes);
> @@ -626,8 +624,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
>  
>  static inline u64
>  btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
> -				 struct btrfs_space_info *space_info,
> -				 bool system_chunk)
> +				 struct btrfs_space_info *space_info)
>  {
>  	struct reserve_ticket *ticket;
>  	u64 used;
> @@ -643,13 +640,13 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
>  
>  	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
>  	if (can_overcommit(fs_info, space_info, to_reclaim,
> -			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
> +			   BTRFS_RESERVE_FLUSH_ALL))
>  		return 0;
>  
>  	used = btrfs_space_info_used(space_info, true);
>  
>  	if (can_overcommit(fs_info, space_info, SZ_1M,
> -			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
> +			   BTRFS_RESERVE_FLUSH_ALL))
>  		expected = div_factor_fine(space_info->total_bytes, 95);
>  	else
>  		expected = div_factor_fine(space_info->total_bytes, 90);
> @@ -665,7 +662,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
>  
>  static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
>  					struct btrfs_space_info *space_info,
> -					u64 used, bool system_chunk)
> +					u64 used)
>  {
>  	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
>  
> @@ -673,8 +670,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
>  	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
>  		return 0;
>  
> -	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
> -					      system_chunk))
> +	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
>  		return 0;
>  
>  	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
> @@ -765,8 +761,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
>  
>  	spin_lock(&space_info->lock);
> -	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
> -						      false);
> +	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
>  	if (!to_reclaim) {
>  		space_info->flush = 0;
>  		spin_unlock(&space_info->lock);
> @@ -785,8 +780,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  			return;
>  		}
>  		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
> -							      space_info,
> -							      false);
> +							      space_info);
>  		if (last_tickets_id == space_info->tickets_id) {
>  			flush_state++;
>  		} else {
> @@ -858,8 +852,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  	int flush_state;
>  
>  	spin_lock(&space_info->lock);
> -	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
> -						      false);
> +	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
>  	if (!to_reclaim) {
>  		spin_unlock(&space_info->lock);
>  		return;
> @@ -990,8 +983,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
>  static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  				    struct btrfs_space_info *space_info,
>  				    u64 orig_bytes,
> -				    enum btrfs_reserve_flush_enum flush,
> -				    bool system_chunk)
> +				    enum btrfs_reserve_flush_enum flush)
>  {
>  	struct reserve_ticket ticket;
>  	u64 used;
> @@ -1013,8 +1005,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  	 */
>  	if (!pending_tickets &&
>  	    ((used + orig_bytes <= space_info->total_bytes) ||
> -	     can_overcommit(fs_info, space_info, orig_bytes, flush,
> -			   system_chunk))) {
> +	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
>  		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
>  						      orig_bytes);
>  		ret = 0;
> @@ -1054,8 +1045,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  		 * the async reclaim as we will panic.
>  		 */
>  		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
> -		    need_do_async_reclaim(fs_info, space_info,
> -					  used, system_chunk) &&
> +		    need_do_async_reclaim(fs_info, space_info, used) &&
>  		    !work_busy(&fs_info->async_reclaim_work)) {
>  			trace_btrfs_trigger_flush(fs_info, space_info->flags,
>  						  orig_bytes, flush, "preempt");
> @@ -1092,10 +1082,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
>  	int ret;
> -	bool system_chunk = (root == fs_info->chunk_root);
>  
>  	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
> -				       orig_bytes, flush, system_chunk);
> +				       orig_bytes, flush);
>  	if (ret == -ENOSPC &&
>  	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
>  		if (block_rsv != global_rsv &&
> 


* Re: [PATCH 2/4] btrfs: kill min_allocable_bytes in inc_block_group_ro
  2019-11-25 14:40 ` [PATCH 2/4] btrfs: kill min_allocable_bytes in inc_block_group_ro Josef Bacik
@ 2019-11-26  2:35   ` Qu Wenruo
  0 siblings, 0 replies; 18+ messages in thread
From: Qu Wenruo @ 2019-11-26  2:35 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu


On 2019/11/25 at 10:40 PM, Josef Bacik wrote:
> This is a relic from a time before we had a proper reservation mechanism
> and you could end up with really full chunks at chunk allocation time.
> This doesn't make sense anymore, so just kill it.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/block-group.c | 22 ++++------------------
>  1 file changed, 4 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 6934a5b8708f..db539bfc5a52 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1185,21 +1185,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	struct btrfs_space_info *sinfo = cache->space_info;
>  	u64 num_bytes;
>  	u64 sinfo_used;
> -	u64 min_allocable_bytes;
>  	int ret = -ENOSPC;
>  
> -	/*
> -	 * We need some metadata space and system metadata space for
> -	 * allocating chunks in some corner cases until we force to set
> -	 * it to be readonly.
> -	 */
> -	if ((sinfo->flags &
> -	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
> -	    !force)
> -		min_allocable_bytes = SZ_1M;
> -	else
> -		min_allocable_bytes = 0;
> -
>  	spin_lock(&sinfo->lock);
>  	spin_lock(&cache->lock);
>  
> @@ -1217,10 +1204,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
>  	 *
>  	 * Here we make sure if we mark this bg RO, we still have enough
> -	 * free space as buffer (if min_allocable_bytes is not 0).
> +	 * free space as buffer.
>  	 */
> -	if (sinfo_used + num_bytes + min_allocable_bytes <=
> -	    sinfo->total_bytes) {
> +	if (sinfo_used + num_bytes + sinfo->total_bytes) {

I guess it's a typo.

It should be "if (sinfo_used + num_bytes <= sinfo->total_bytes) {"

Although the last patch will remove the check, it's still better to keep
each patch working correctly, to make bisecting easier.
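
To spell out why the typo'd condition is effectively always true (a sum of
u64 values is "true" whenever it is non-zero), a tiny demonstration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t sinfo_used = 2, num_bytes = 2, total_bytes = 3;

        if (sinfo_used + num_bytes + total_bytes)       /* the typo */
                printf("taken for any non-zero inputs\n");
        if (sinfo_used + num_bytes <= total_bytes)      /* intended check */
                printf("taken only when there is room\n");
        return 0;
}

Only the first branch is taken here, even though 2 + 2 > 3.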

Thanks,
Qu

>  		sinfo->bytes_readonly += num_bytes;
>  		cache->ro++;
>  		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
> @@ -1233,8 +1219,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  		btrfs_info(cache->fs_info,
>  			"unable to make block group %llu ro", cache->start);
>  		btrfs_info(cache->fs_info,
> -			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
> -			sinfo_used, num_bytes, min_allocable_bytes);
> +			"sinfo_used=%llu bg_num_bytes=%llu",
> +			sinfo_used, num_bytes);
>  		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
>  	}
>  	return ret;
> 


* Re: [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-25 14:40 ` [PATCH 3/4] btrfs: fix force usage " Josef Bacik
@ 2019-11-26  2:43   ` Qu Wenruo
  2019-11-26  4:59     ` Qu Wenruo
  2019-11-26 10:09   ` Nikolay Borisov
  1 sibling, 1 reply; 18+ messages in thread
From: Qu Wenruo @ 2019-11-26  2:43 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu


On 2019/11/25 at 10:40 PM, Josef Bacik wrote:
> For some reason we've translated the do_chunk_alloc that goes into
> btrfs_inc_block_group_ro to force in inc_block_group_ro, but these are
> two different things.
> 
> force for inc_block_group_ro is used when we are forcing the block group
> read only no matter what, for example when the underlying chunk is
> marked read only.  We need to not do the space check here as this block
> group needs to be read only.
> 
> btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates that
> we need to pre-allocate a chunk before marking the block group read
> only.  This has nothing to do with forcing, and in fact we _always_ want
> to do the space check in this case, so unconditionally pass false for
> force in this case.

I think the patch order makes things a little hard to grasp here.
Without the last patch, the idea itself is not correct.

The reason to force RO is that we want to avoid having an empty chunk
allocated, especially in the scrub case.

If you put the last patch before this one, it's clearer: since we can
then accept overcommit, we won't return a false ENOSPC and no empty
chunk gets created.

BTW, with the last patch applied, we can remove that @force parameter
for inc_block_group_ro().

Thanks,
Qu
> 
> Then fixup inc_block_group_ro to honor force as it's expected and
> documented to do.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/block-group.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index db539bfc5a52..3ffbc2e0af21 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1190,8 +1190,10 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	spin_lock(&sinfo->lock);
>  	spin_lock(&cache->lock);
>  
> -	if (cache->ro) {
> +	if (cache->ro || force) {
>  		cache->ro++;
> +		if (list_empty(&cache->ro_list))
> +			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
>  		ret = 0;
>  		goto out;
>  	}
> @@ -2063,7 +2065,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
>  		}
>  	}
>  
> -	ret = inc_block_group_ro(cache, !do_chunk_alloc);
> +	ret = inc_block_group_ro(cache, false);
>  	if (!do_chunk_alloc)
>  		goto unlock_out;
>  	if (!ret)
> 


* Re: [PATCH 4/4] btrfs: use btrfs_can_overcommit in inc_block_group_ro
  2019-11-25 14:40 ` [PATCH 4/4] btrfs: use btrfs_can_overcommit " Josef Bacik
@ 2019-11-26  3:00   ` Qu Wenruo
  2019-11-26 16:28     ` Josef Bacik
  2019-11-26 10:18   ` Nikolay Borisov
  1 sibling, 1 reply; 18+ messages in thread
From: Qu Wenruo @ 2019-11-26  3:00 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu


On 2019/11/25 at 10:40 PM, Josef Bacik wrote:
> inc_block_group_ro does a calculation to see whether we would have enough
> room left over if we marked this block group read only, in order to
> decide whether it's ok to do so.
> 
> The problem is this calculation _only_ works for data, where our used is
> always less than our total.  For metadata we will overcommit, so this
> will almost always fail for metadata.
> 
> Fix this by exporting btrfs_can_overcommit, and then checking whether we
> have enough space to cover the remaining free space in the block group we
> are trying to mark read only.  If we do then we can mark this block
> group as read only.

This patch is indeed much better than my naive RFC patches.

Instead of reducing the overcommit threshold, this just switches the
check to can_overcommit(), brilliant.

However, some small nitpicks below.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/block-group.c | 37 ++++++++++++++++++++++++++-----------
>  fs/btrfs/space-info.c  | 19 ++++++++++---------
>  fs/btrfs/space-info.h  |  3 +++
>  3 files changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 3ffbc2e0af21..7b1f6d2b9621 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1184,7 +1184,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
Now the @force parameter is not used any more.

We can have a cleanup patch for it.

>  {
>  	struct btrfs_space_info *sinfo = cache->space_info;
>  	u64 num_bytes;
> -	u64 sinfo_used;
>  	int ret = -ENOSPC;
>  
>  	spin_lock(&sinfo->lock);
> @@ -1200,19 +1199,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  
>  	num_bytes = cache->length - cache->reserved - cache->pinned -
>  		    cache->bytes_super - cache->used;
> -	sinfo_used = btrfs_space_info_used(sinfo, true);
>  
>  	/*
> -	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
> -	 *
> -	 * Here we make sure if we mark this bg RO, we still have enough
> -	 * free space as buffer.
> +	 * Data never overcommits, even in mixed mode, so do just the straight
> +	 * check of left over space in how much we have allocated.
>  	 */
> -	if (sinfo_used + num_bytes + sinfo->total_bytes) {
> +	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
> +		u64 sinfo_used = btrfs_space_info_used(sinfo, true);
> +
> +		/*
> +		 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
> +		 *
> +		 * Here we make sure if we mark this bg RO, we still have enough
> +		 * free space as buffer.
> +		 */
> +		if (sinfo_used + num_bytes + sinfo->total_bytes)

The same code copied from patch 3, I guess?
It carries the same typo.

> +			ret = 0;
> +	} else {
> +		/*
> +		 * We overcommit metadata, so we need to do the
> +		 * btrfs_can_overcommit check here, and we need to pass in
> +		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
> +		 * leeway to allow us to mark this block group as read only.
> +		 */
> +		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
> +					 BTRFS_RESERVE_NO_FLUSH))
> +			ret = 0;
> +	}

For metadata chunks, allocating a new chunk won't change how we do the
overcommit calculation.

I guess we can skip the chunk allocation for metadata chunks in
btrfs_inc_block_group_ro()?
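
A quick sanity check of that claim, using the simplified relation
avail = unallocated / factor (my model, not the exact kernel math):

  before:  used + bytes < total + unallocated / factor

  allocate a chunk of logical size C (consuming C * factor raw bytes):

        total'       = total + C
        unallocated' = unallocated - C * factor

        total' + unallocated' / factor
          = total + C + unallocated / factor - C
          = total + unallocated / factor

The right-hand side of the comparison is unchanged, so the overcommit
answer should indeed be the same before and after the allocation.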

Thanks,
Qu

> +
> +	if (!ret) {
>  		sinfo->bytes_readonly += num_bytes;
>  		cache->ro++;
>  		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
> -		ret = 0;
>  	}
>  out:
>  	spin_unlock(&cache->lock);
> @@ -1220,9 +1238,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
>  		btrfs_info(cache->fs_info,
>  			"unable to make block group %llu ro", cache->start);
> -		btrfs_info(cache->fs_info,
> -			"sinfo_used=%llu bg_num_bytes=%llu",
> -			sinfo_used, num_bytes);
>  		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
>  	}
>  	return ret;
> diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
> index df5fb68df798..01297c5b2666 100644
> --- a/fs/btrfs/space-info.c
> +++ b/fs/btrfs/space-info.c
> @@ -159,9 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
>  	return (global->size << 1);
>  }
>  
> -static int can_overcommit(struct btrfs_fs_info *fs_info,
> -			  struct btrfs_space_info *space_info, u64 bytes,
> -			  enum btrfs_reserve_flush_enum flush)
> +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
> +			 struct btrfs_space_info *space_info, u64 bytes,
> +			 enum btrfs_reserve_flush_enum flush)
>  {
>  	u64 profile;
>  	u64 avail;
> @@ -226,7 +226,8 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
>  
>  		/* Check and see if our ticket can be satisified now. */
>  		if ((used + ticket->bytes <= space_info->total_bytes) ||
> -		    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
> +		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
> +					 flush)) {
>  			btrfs_space_info_update_bytes_may_use(fs_info,
>  							      space_info,
>  							      ticket->bytes);
> @@ -639,14 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
>  		return to_reclaim;
>  
>  	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
> -	if (can_overcommit(fs_info, space_info, to_reclaim,
> -			   BTRFS_RESERVE_FLUSH_ALL))
> +	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
> +				 BTRFS_RESERVE_FLUSH_ALL))
>  		return 0;
>  
>  	used = btrfs_space_info_used(space_info, true);
>  
> -	if (can_overcommit(fs_info, space_info, SZ_1M,
> -			   BTRFS_RESERVE_FLUSH_ALL))
> +	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
> +				 BTRFS_RESERVE_FLUSH_ALL))
>  		expected = div_factor_fine(space_info->total_bytes, 95);
>  	else
>  		expected = div_factor_fine(space_info->total_bytes, 90);
> @@ -1005,7 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  	 */
>  	if (!pending_tickets &&
>  	    ((used + orig_bytes <= space_info->total_bytes) ||
> -	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
> +	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
>  		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
>  						      orig_bytes);
>  		ret = 0;
> diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
> index 1a349e3f9cc1..24514cd2c6c1 100644
> --- a/fs/btrfs/space-info.h
> +++ b/fs/btrfs/space-info.h
> @@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
>  				 enum btrfs_reserve_flush_enum flush);
>  void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
>  				struct btrfs_space_info *space_info);
> +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
> +			 struct btrfs_space_info *space_info, u64 bytes,
> +			 enum btrfs_reserve_flush_enum flush);
>  
>  static inline void btrfs_space_info_free_bytes_may_use(
>  				struct btrfs_fs_info *fs_info,
> 


* Re: [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-26  2:43   ` Qu Wenruo
@ 2019-11-26  4:59     ` Qu Wenruo
  0 siblings, 0 replies; 18+ messages in thread
From: Qu Wenruo @ 2019-11-26  4:59 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu


On 2019/11/26 at 10:43 AM, Qu Wenruo wrote:
> 
> 
> On 2019/11/25 at 10:40 PM, Josef Bacik wrote:
>> For some reason we've translated the do_chunk_alloc that goes into
>> btrfs_inc_block_group_ro to force in inc_block_group_ro, but these are
>> two different things.
>>
>> force for inc_block_group_ro is used when we are forcing the block group
>> read only no matter what, for example when the underlying chunk is
>> marked read only.  We need to not do the space check here as this block
>> group needs to be read only.
>>
>> btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates that
>> we need to pre-allocate a chunk before marking the block group read
>> only.  This has nothing to do with forcing, and in fact we _always_ want
>> to do the space check in this case, so unconditionally pass false for
>> force in this case.
> 
> I think the patch order makes things a little hard to grasp here.
> Without the last patch, the idea itself is not correct.
> 
> The reason to force RO is that we want to avoid having an empty chunk
> allocated, especially in the scrub case.
> 
> If you put the last patch before this one, it's clearer: since we can
> then accept overcommit, we won't return a false ENOSPC and no empty
> chunk gets created.
> 
> BTW, with the last patch applied, we can remove that @force parameter
> for inc_block_group_ro().

My bad, the @force parameter is still needed. I didn't notice that until
all the patches were applied.

Thanks,
Qu

> 
> Thanks,
> Qu
>>
>> Then fixup inc_block_group_ro to honor force as it's expected and
>> documented to do.
>>
>> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
>> ---
>>  fs/btrfs/block-group.c | 6 ++++--
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index db539bfc5a52..3ffbc2e0af21 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -1190,8 +1190,10 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>>  	spin_lock(&sinfo->lock);
>>  	spin_lock(&cache->lock);
>>  
>> -	if (cache->ro) {
>> +	if (cache->ro || force) {
>>  		cache->ro++;
>> +		if (list_empty(&cache->ro_list))
>> +			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
>>  		ret = 0;
>>  		goto out;
>>  	}
>> @@ -2063,7 +2065,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
>>  		}
>>  	}
>>  
>> -	ret = inc_block_group_ro(cache, !do_chunk_alloc);
>> +	ret = inc_block_group_ro(cache, false);
>>  	if (!do_chunk_alloc)
>>  		goto unlock_out;
>>  	if (!ret)
>>
> 


* Re: [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit
  2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
  2019-11-25 15:36   ` Johannes Thumshirn
  2019-11-26  2:32   ` Qu Wenruo
@ 2019-11-26  8:42   ` Nikolay Borisov
  2 siblings, 0 replies; 18+ messages in thread
From: Nikolay Borisov @ 2019-11-26  8:42 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu



On 25.11.19 at 16:40, Josef Bacik wrote:
> We have the space_info, we can just check its flags to see if it's the
> system chunk space info.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

* Re: [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-25 14:40 ` [PATCH 3/4] btrfs: fix force usage " Josef Bacik
  2019-11-26  2:43   ` Qu Wenruo
@ 2019-11-26 10:09   ` Nikolay Borisov
  1 sibling, 0 replies; 18+ messages in thread
From: Nikolay Borisov @ 2019-11-26 10:09 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu



On 25.11.19 at 16:40, Josef Bacik wrote:
> For some reason we've translated the do_chunk_alloc that goes into
> btrfs_inc_block_group_ro to force in inc_block_group_ro, but these are
> two different things.
> 
> force for inc_block_group_ro is used when we are forcing the block group
> read only no matter what, for example when the underlying chunk is
> marked read only.  We need to not do the space check here as this block
> group needs to be read only.
> 
> btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates that
> we need to pre-allocate a chunk before marking the block group read
> only.  This has nothing to do with forcing, and in fact we _always_ want
> to do the space check in this case, so unconditionally pass false for
> force in this case.
> 
> Then fixup inc_block_group_ro to honor force as it's expected and
> documented to do.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

> ---
>  fs/btrfs/block-group.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index db539bfc5a52..3ffbc2e0af21 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1190,8 +1190,10 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	spin_lock(&sinfo->lock);
>  	spin_lock(&cache->lock);
>  
> -	if (cache->ro) {
> +	if (cache->ro || force) {
>  		cache->ro++;
> +		if (list_empty(&cache->ro_list))
> +			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);

nit: This only makes sense in the case of force, so just to make it
clearer perhaps the check can be modified to if (force || list_empty)?

>  		ret = 0;
>  		goto out;
>  	}
> @@ -2063,7 +2065,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
>  		}
>  	}
>  
> -	ret = inc_block_group_ro(cache, !do_chunk_alloc);
> +	ret = inc_block_group_ro(cache, false);
>  	if (!do_chunk_alloc)
>  		goto unlock_out;
>  	if (!ret)
> 

* Re: [PATCH 4/4] btrfs: use btrfs_can_overcommit in inc_block_group_ro
  2019-11-25 14:40 ` [PATCH 4/4] btrfs: use btrfs_can_overcommit " Josef Bacik
  2019-11-26  3:00   ` Qu Wenruo
@ 2019-11-26 10:18   ` Nikolay Borisov
  1 sibling, 0 replies; 18+ messages in thread
From: Nikolay Borisov @ 2019-11-26 10:18 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu



On 25.11.19 at 16:40, Josef Bacik wrote:
> inc_block_group_ro does a calculation to see whether we would have enough
> room left over if we marked this block group read only, in order to
> decide whether it's ok to do so.
> 
> The problem is this calculation _only_ works for data, where our used is
> always less than our total.  For metadata we will overcommit, so this
> will almost always fail for metadata.
> 
> Fix this by exporting btrfs_can_overcommit, and then checking whether we
> have enough space to cover the remaining free space in the block group we
> are trying to mark read only.  If we do then we can mark this block
> group as read only.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/block-group.c | 37 ++++++++++++++++++++++++++-----------
>  fs/btrfs/space-info.c  | 19 ++++++++++---------
>  fs/btrfs/space-info.h  |  3 +++
>  3 files changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 3ffbc2e0af21..7b1f6d2b9621 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1184,7 +1184,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  {
>  	struct btrfs_space_info *sinfo = cache->space_info;
>  	u64 num_bytes;
> -	u64 sinfo_used;
>  	int ret = -ENOSPC;
>  
>  	spin_lock(&sinfo->lock);
> @@ -1200,19 +1199,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  
>  	num_bytes = cache->length - cache->reserved - cache->pinned -
>  		    cache->bytes_super - cache->used;
> -	sinfo_used = btrfs_space_info_used(sinfo, true);
>  
>  	/*
> -	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
> -	 *
> -	 * Here we make sure if we mark this bg RO, we still have enough
> -	 * free space as buffer.
> +	 * Data never overcommits, even in mixed mode, so do just the straight
> +	 * check of left over space in how much we have allocated.
>  	 */
> -	if (sinfo_used + num_bytes + sinfo->total_bytes) {
> +	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
> +		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

does that mean bytes_may_use is always 0 for DATA?

> +
> +		/*
> +		 * sinfo_used + num_bytes should always <= sinfo->total_bytes.

This invariant warrants an ASSERT.

> +		 *
> +		 * Here we make sure if we mark this bg RO, we still have enough
> +		 * free space as buffer.
> +		 */
> +		if (sinfo_used + num_bytes + sinfo->total_bytes)
> +			ret = 0;
> +	} else {
> +		/*
> +		 * We overcommit metadata, so we need to do the
> +		 * btrfs_can_overcommit check here, and we need to pass in
> +		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
> +		 * leeway to allow us to mark this block group as read only.
> +		 */
> +		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
> +					 BTRFS_RESERVE_NO_FLUSH))
> +			ret = 0;
> +	}
> +
> +	if (!ret) {
>  		sinfo->bytes_readonly += num_bytes;
>  		cache->ro++;
>  		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
> -		ret = 0;
>  	}
>  out:
>  	spin_unlock(&cache->lock);
> @@ -1220,9 +1238,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
>  		btrfs_info(cache->fs_info,
>  			"unable to make block group %llu ro", cache->start);
> -		btrfs_info(cache->fs_info,
> -			"sinfo_used=%llu bg_num_bytes=%llu",
> -			sinfo_used, num_bytes);
>  		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
>  	}
>  	return ret;
> diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
> index df5fb68df798..01297c5b2666 100644
> --- a/fs/btrfs/space-info.c
> +++ b/fs/btrfs/space-info.c
> @@ -159,9 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
>  	return (global->size << 1);
>  }
>  
> -static int can_overcommit(struct btrfs_fs_info *fs_info,
> -			  struct btrfs_space_info *space_info, u64 bytes,
> -			  enum btrfs_reserve_flush_enum flush)
> +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
> +			 struct btrfs_space_info *space_info, u64 bytes,
> +			 enum btrfs_reserve_flush_enum flush)
>  {
>  	u64 profile;
>  	u64 avail;
> @@ -226,7 +226,8 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
>  
>  		/* Check and see if our ticket can be satisified now. */
>  		if ((used + ticket->bytes <= space_info->total_bytes) ||
> -		    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
> +		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
> +					 flush)) {
>  			btrfs_space_info_update_bytes_may_use(fs_info,
>  							      space_info,
>  							      ticket->bytes);
> @@ -639,14 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
>  		return to_reclaim;
>  
>  	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
> -	if (can_overcommit(fs_info, space_info, to_reclaim,
> -			   BTRFS_RESERVE_FLUSH_ALL))
> +	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
> +				 BTRFS_RESERVE_FLUSH_ALL))
>  		return 0;
>  
>  	used = btrfs_space_info_used(space_info, true);
>  
> -	if (can_overcommit(fs_info, space_info, SZ_1M,
> -			   BTRFS_RESERVE_FLUSH_ALL))
> +	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
> +				 BTRFS_RESERVE_FLUSH_ALL))
>  		expected = div_factor_fine(space_info->total_bytes, 95);
>  	else
>  		expected = div_factor_fine(space_info->total_bytes, 90);
> @@ -1005,7 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  	 */
>  	if (!pending_tickets &&
>  	    ((used + orig_bytes <= space_info->total_bytes) ||
> -	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
> +	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
>  		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
>  						      orig_bytes);
>  		ret = 0;
> diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
> index 1a349e3f9cc1..24514cd2c6c1 100644
> --- a/fs/btrfs/space-info.h
> +++ b/fs/btrfs/space-info.h
> @@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
>  				 enum btrfs_reserve_flush_enum flush);
>  void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
>  				struct btrfs_space_info *space_info);
> +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
> +			 struct btrfs_space_info *space_info, u64 bytes,
> +			 enum btrfs_reserve_flush_enum flush);
>  
>  static inline void btrfs_space_info_free_bytes_may_use(
>  				struct btrfs_fs_info *fs_info,
> 

* Re: [PATCH 4/4] btrfs: use btrfs_can_overcommit in inc_block_group_ro
  2019-11-26  3:00   ` Qu Wenruo
@ 2019-11-26 16:28     ` Josef Bacik
  0 siblings, 0 replies; 18+ messages in thread
From: Josef Bacik @ 2019-11-26 16:28 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Josef Bacik, linux-btrfs, kernel-team, wqu

On Tue, Nov 26, 2019 at 11:00:24AM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/11/25 at 10:40 PM, Josef Bacik wrote:
> > inc_block_group_ro does a calculation to see whether we would have enough
> > room left over if we marked this block group read only, in order to
> > decide whether it's ok to do so.
> > 
> > The problem is this calculation _only_ works for data, where our used is
> > always less than our total.  For metadata we will overcommit, so this
> > will almost always fail for metadata.
> > 
> > Fix this by exporting btrfs_can_overcommit, and then checking whether we
> > have enough space to cover the remaining free space in the block group we
> > are trying to mark read only.  If we do then we can mark this block
> > group as read only.
> 
> This patch is indeed much better than my naive RFC patches.
> 
> Instead of reducing the overcommit threshold, this just switches the
> check to can_overcommit(), brilliant.
> 
> However, some small nitpicks below.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  fs/btrfs/block-group.c | 37 ++++++++++++++++++++++++++-----------
> >  fs/btrfs/space-info.c  | 19 ++++++++++---------
> >  fs/btrfs/space-info.h  |  3 +++
> >  3 files changed, 39 insertions(+), 20 deletions(-)
> > 
> > diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> > index 3ffbc2e0af21..7b1f6d2b9621 100644
> > --- a/fs/btrfs/block-group.c
> > +++ b/fs/btrfs/block-group.c
> > @@ -1184,7 +1184,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
> Now the @force parameter is not used any more.
> 
> We can have a cleanup patch for it.
> 
> >  {
> >  	struct btrfs_space_info *sinfo = cache->space_info;
> >  	u64 num_bytes;
> > -	u64 sinfo_used;
> >  	int ret = -ENOSPC;
> >  
> >  	spin_lock(&sinfo->lock);
> > @@ -1200,19 +1199,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
> >  
> >  	num_bytes = cache->length - cache->reserved - cache->pinned -
> >  		    cache->bytes_super - cache->used;
> > -	sinfo_used = btrfs_space_info_used(sinfo, true);
> >  
> >  	/*
> > -	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
> > -	 *
> > -	 * Here we make sure if we mark this bg RO, we still have enough
> > -	 * free space as buffer.
> > +	 * Data never overcommits, even in mixed mode, so do just the straight
> > +	 * check of left over space in how much we have allocated.
> >  	 */
> > -	if (sinfo_used + num_bytes + sinfo->total_bytes) {
> > +	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
> > +		u64 sinfo_used = btrfs_space_info_used(sinfo, true);
> > +
> > +		/*
> > +		 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
> > +		 *
> > +		 * Here we make sure if we mark this bg RO, we still have enough
> > +		 * free space as buffer.
> > +		 */
> > +		if (sinfo_used + num_bytes + sinfo->total_bytes)
> 
> The same code copied from patch 3 I guess?
> The same typo.
> 
> > +			ret = 0;
> > +	} else {
> > +		/*
> > +		 * We overcommit metadata, so we need to do the
> > +		 * btrfs_can_overcommit check here, and we need to pass in
> > +		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
> > +		 * leeway to allow us to mark this block group as read only.
> > +		 */
> > +		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
> > +					 BTRFS_RESERVE_NO_FLUSH))
> > +			ret = 0;
> > +	}
> 
> For metadata chunks, allocating a new chunk won't change how we do the
> overcommit calculation.
> 
> I guess we can skip the chunk allocation for metadata chunks in
> btrfs_inc_block_group_ro()?

I _think_ so?  But you are working/testing in this area right now, so try it
and see how it works out.  I _think_ we'll still need to pre-allocate for data,
and I feel like relocation is an area where we may not be able to allocate new
chunks at certain times, so pre-allocating the metadata chunk may be useful.
System is another one I would be iffy on.

If we're going to do that I would much rather it be a separate change, and only
after somebody has tested it to death to make sure it's not going to bite us in
the ass.  Thanks,

Josef
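
To make the difference between the two checks concrete, here is a small
userspace toy model (an illustration only, not kernel code: the toy_*
names are invented, and the 1/2 vs 1/8 overcommit factors are
paraphrased from memory of can_overcommit() in this era of the tree):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

struct toy_space_info {
	u64 total_bytes;	/* bytes in allocated chunks of this type */
	u64 used;		/* bytes used, including readonly */
	u64 reserved;
	u64 pinned;
	u64 may_use;		/* outstanding reservations */
};

enum toy_flush { TOY_NO_FLUSH, TOY_FLUSH_ALL };

static u64 toy_space_info_used(const struct toy_space_info *s)
{
	return s->used + s->reserved + s->pinned + s->may_use;
}

/* DATA never overcommits: the bg's leftover free space must fit. */
static bool toy_data_ro_check(const struct toy_space_info *s, u64 num_bytes)
{
	return toy_space_info_used(s) + num_bytes <= s->total_bytes;
}

/*
 * METADATA overcommits against unallocated device space ("avail").
 * NO_FLUSH allows up to 1/2 of avail, FLUSH_ALL only 1/8, which is why
 * BTRFS_RESERVE_NO_FLUSH gives the most leeway.
 */
static bool toy_can_overcommit(const struct toy_space_info *s, u64 bytes,
			       u64 avail, enum toy_flush flush)
{
	avail >>= (flush == TOY_FLUSH_ALL) ? 3 : 1;
	return toy_space_info_used(s) + bytes < s->total_bytes + avail;
}

int main(void)
{
	/* a metadata space_info with overcommitted reservations */
	struct toy_space_info meta = {
		.total_bytes = 1024ULL << 20,	/* 1GiB of metadata chunks */
		.used = 600ULL << 20,
		.may_use = 700ULL << 20,	/* used + may_use > total_bytes */
	};
	u64 num_bytes = 200ULL << 20;	/* free space in the bg going RO */
	u64 avail = 10ULL << 30;	/* unallocated bytes on the devices */

	printf("data-style check: %s\n",
	       toy_data_ro_check(&meta, num_bytes) ? "ok" : "ENOSPC");
	printf("overcommit check: %s\n",
	       toy_can_overcommit(&meta, num_bytes, avail, TOY_NO_FLUSH) ?
			"ok" : "ENOSPC");
	return 0;
}

With these numbers the data-style check reports ENOSPC while the
overcommit check succeeds, which is exactly the metadata behavior the
patch above is fixing.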


* Re: [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-27 10:45   ` Qu Wenruo
@ 2019-12-03 19:50     ` David Sterba
  0 siblings, 0 replies; 18+ messages in thread
From: David Sterba @ 2019-12-03 19:50 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Josef Bacik, linux-btrfs, kernel-team, wqu, Nikolay Borisov

On Wed, Nov 27, 2019 at 06:45:25PM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/11/27 at 12:25 AM, Josef Bacik wrote:
> > For some reason we've been translating the do_chunk_alloc flag that is
> > passed into btrfs_inc_block_group_ro() into force in
> > inc_block_group_ro(), but these are two different things.
> >
> > force for inc_block_group_ro() is used when we are forcing the block
> > group read only no matter what, for example when the underlying chunk
> > is marked read only.  We need to skip the space check here, as this
> > block group has to become read only.
> >
> > btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates
> > that we need to pre-allocate a chunk before marking the block group
> > read only.  This has nothing to do with forcing, and in fact we
> > _always_ want to do the space check in this case, so unconditionally
> > pass false for force here.
> >
> > Then fix up inc_block_group_ro() to honor force as it's expected and
> > documented to do.
> >
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > Reviewed-by: Nikolay Borisov <nborisov@suse.com>
> > ---
> >  fs/btrfs/block-group.c | 11 +++++++++--
> >  1 file changed, 9 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> > index 66fa39632cde..5961411500ed 100644
> > --- a/fs/btrfs/block-group.c
> > +++ b/fs/btrfs/block-group.c
> > @@ -1190,8 +1190,15 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
> >  	spin_lock(&sinfo->lock);
> >  	spin_lock(&cache->lock);
> >
> > -	if (cache->ro) {
> > +	if (cache->ro || force) {
> >  		cache->ro++;
> > +
> > +		/*
> > +		 * We should only be empty if we did force here and haven't
> > +		 * already marked ourselves read only.
> > +		 */
> > +		if (force && list_empty(&cache->ro_list))
> > +			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
> >  		ret = 0;
> >  		goto out;
> >  	}
> > @@ -2063,7 +2070,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
> >  		}
> >  	}
> >
> > -	ret = inc_block_group_ro(cache, !do_chunk_alloc);
> > +	ret = inc_block_group_ro(cache, false);
> 
> This is going to make scrub return a false ENOSPC.
> 
> Since commit b12de52896c0 ("btrfs: scrub: Don't check free space before
> marking a block group RO"), scrub doesn't do the pre-alloc check at all.
> 
> If there is only a single data chunk, and it has some reserved data
> space, we will hit ENOSPC at scrub time.
> 
> 
> That commit was only there to prevent unnecessary system chunk
> preallocation; since your next patch is going to make
> inc_block_group_ro() follow metadata overcommit, there is no need for
> b12de52896c0 anymore.
>
> You can just revert that commit after your next patch, or fold the
> revert into the next patch, to make bisecting easier.

A revert could be problematic in case there are commits that change the
logic (and code), so it would IMHO be better to replace the logic and
then remove the obsolete code. Bisection should not be intentionally
broken, unless it becomes infeasible to keep the code changes
reasonable. IOW if it gets broken, a notice in the changelog should do,
as this will be fixed by the commit and the ENOSPC during scrub is only
a transient problem.


* Re: [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-26 16:25 ` [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro Josef Bacik
@ 2019-11-27 10:45   ` Qu Wenruo
  2019-12-03 19:50     ` David Sterba
  0 siblings, 1 reply; 18+ messages in thread
From: Qu Wenruo @ 2019-11-27 10:45 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team, wqu; +Cc: Nikolay Borisov



On 2019/11/27 at 12:25 AM, Josef Bacik wrote:
> For some reason we've been translating the do_chunk_alloc flag that is
> passed into btrfs_inc_block_group_ro() into force in
> inc_block_group_ro(), but these are two different things.
>
> force for inc_block_group_ro() is used when we are forcing the block
> group read only no matter what, for example when the underlying chunk
> is marked read only.  We need to skip the space check here, as this
> block group has to become read only.
>
> btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates
> that we need to pre-allocate a chunk before marking the block group
> read only.  This has nothing to do with forcing, and in fact we
> _always_ want to do the space check in this case, so unconditionally
> pass false for force here.
>
> Then fix up inc_block_group_ro() to honor force as it's expected and
> documented to do.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> Reviewed-by: Nikolay Borisov <nborisov@suse.com>
> ---
>  fs/btrfs/block-group.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 66fa39632cde..5961411500ed 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1190,8 +1190,15 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
>  	spin_lock(&sinfo->lock);
>  	spin_lock(&cache->lock);
>
> -	if (cache->ro) {
> +	if (cache->ro || force) {
>  		cache->ro++;
> +
> +		/*
> +		 * We should only be empty if we did force here and haven't
> +		 * already marked ourselves read only.
> +		 */
> +		if (force && list_empty(&cache->ro_list))
> +			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
>  		ret = 0;
>  		goto out;
>  	}
> @@ -2063,7 +2070,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
>  		}
>  	}
>
> -	ret = inc_block_group_ro(cache, !do_chunk_alloc);
> +	ret = inc_block_group_ro(cache, false);

This is going to make scrub return a false ENOSPC.

Since commit b12de52896c0 ("btrfs: scrub: Don't check free space before
marking a block group RO"), scrub doesn't do the pre-alloc check at all.

If there is only a single data chunk, and it has some reserved data
space, we will hit ENOSPC at scrub time.
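
(To illustrate with made-up numbers: a single 1GiB data chunk with
700MiB used and 200MiB of outstanding reservations in bytes_may_use.
The free space left in the block group is about 324MiB, and
btrfs_space_info_used() already counts the 200MiB of reservations, so
used plus free space comes to roughly 1224MiB against 1024MiB of
total_bytes, and inc_block_group_ro() fails with ENOSPC even though
scrub only wants to mark the block group RO.)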


That commit was only there to prevent unnecessary system chunk
preallocation; since your next patch is going to make
inc_block_group_ro() follow metadata overcommit, there is no need for
b12de52896c0 anymore.

You can just revert that commit after your next patch, or fold the
revert into the next patch, to make bisecting easier.

Thanks,
Qu


>  	if (!do_chunk_alloc)
>  		goto unlock_out;
>  	if (!ret)
>


* [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro
  2019-11-26 16:25 [PATCH 0/4][v2] clean up how we mark block groups read only Josef Bacik
@ 2019-11-26 16:25 ` Josef Bacik
  2019-11-27 10:45   ` Qu Wenruo
  0 siblings, 1 reply; 18+ messages in thread
From: Josef Bacik @ 2019-11-26 16:25 UTC (permalink / raw)
  To: linux-btrfs, kernel-team, wqu; +Cc: Nikolay Borisov

For some reason we've been translating the do_chunk_alloc flag that is
passed into btrfs_inc_block_group_ro() into force in
inc_block_group_ro(), but these are two different things.

force for inc_block_group_ro() is used when we are forcing the block
group read only no matter what, for example when the underlying chunk
is marked read only.  We need to skip the space check here, as this
block group has to become read only.

btrfs_inc_block_group_ro() has a do_chunk_alloc flag that indicates
that we need to pre-allocate a chunk before marking the block group
read only.  This has nothing to do with forcing, and in fact we
_always_ want to do the space check in this case, so unconditionally
pass false for force here.

Then fix up inc_block_group_ro() to honor force as it's expected and
documented to do.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/block-group.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 66fa39632cde..5961411500ed 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1190,8 +1190,15 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);
 
-	if (cache->ro) {
+	if (cache->ro || force) {
 		cache->ro++;
+
+		/*
+		 * We should only be empty if we did force here and haven't
+		 * already marked ourselves read only.
+		 */
+		if (force && list_empty(&cache->ro_list))
+			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 		ret = 0;
 		goto out;
 	}
@@ -2063,7 +2070,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 		}
 	}
 
-	ret = inc_block_group_ro(cache, !do_chunk_alloc);
+	ret = inc_block_group_ro(cache, false);
 	if (!do_chunk_alloc)
 		goto unlock_out;
 	if (!ret)
-- 
2.23.0
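
For reference, the two call sites this force/do_chunk_alloc distinction
matters for look roughly like the following in this era of the tree
(quoted from memory, so double-check against the actual source):

	/* relocation (btrfs_relocate_block_group): pre-allocate a chunk
	 * before trying to mark the block group read only */
	ret = btrfs_inc_block_group_ro(rc->block_group, true);

	/* scrub (scrub_enumerate_chunks), since b12de52896c0: skip the
	 * pre-allocation and just try to mark it read only */
	ret = btrfs_inc_block_group_ro(cache, false);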



Thread overview: 18+ messages
2019-11-25 14:40 [PATCH 0/4][RFC] clean up how we mark block groups read only Josef Bacik
2019-11-25 14:40 ` [PATCH 1/4] btrfs: don't pass system_chunk into can_overcommit Josef Bacik
2019-11-25 15:36   ` Johannes Thumshirn
2019-11-26  2:32   ` Qu Wenruo
2019-11-26  8:42   ` Nikolay Borisov
2019-11-25 14:40 ` [PATCH 2/4] btrfs: kill min_allocable_bytes in inc_block_group_ro Josef Bacik
2019-11-26  2:35   ` Qu Wenruo
2019-11-25 14:40 ` [PATCH 3/4] btrfs: fix force usage " Josef Bacik
2019-11-26  2:43   ` Qu Wenruo
2019-11-26  4:59     ` Qu Wenruo
2019-11-26 10:09   ` Nikolay Borisov
2019-11-25 14:40 ` [PATCH 4/4] btrfs: use btrfs_can_overcommit " Josef Bacik
2019-11-26  3:00   ` Qu Wenruo
2019-11-26 16:28     ` Josef Bacik
2019-11-26 10:18   ` Nikolay Borisov
2019-11-26 16:25 [PATCH 0/4][v2] clean up how we mark block groups read only Josef Bacik
2019-11-26 16:25 ` [PATCH 3/4] btrfs: fix force usage in inc_block_group_ro Josef Bacik
2019-11-27 10:45   ` Qu Wenruo
2019-12-03 19:50     ` David Sterba
