* [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
@ 2018-07-19 14:49 Josef Bacik
  2018-07-19 14:49 ` [PATCH 02/22] btrfs: add cleanup_ref_head_accounting helper Josef Bacik
                   ` (24 more replies)
  0 siblings, 25 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We do this dance in cleanup_ref_head and check_ref_cleanup, so unify it
into a helper and clean up the calling functions.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/delayed-ref.c | 14 ++++++++++++++
 fs/btrfs/delayed-ref.h |  3 ++-
 fs/btrfs/extent-tree.c | 24 ++++--------------------
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 03dec673d12a..e1b322d651dd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -393,6 +393,20 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
 	return head;
 }
 
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+			   struct btrfs_delayed_ref_head *head)
+{
+	lockdep_assert_held(&delayed_refs->lock);
+	lockdep_assert_held(&head->lock);
+
+	rb_erase(&head->href_node, &delayed_refs->href_root);
+	RB_CLEAR_NODE(&head->href_node);
+	atomic_dec(&delayed_refs->num_entries);
+	delayed_refs->num_heads--;
+	if (head->processing == 0)
+		delayed_refs->num_heads_ready--;
+}
+
 /*
  * Helper to insert the ref_node to the tail or merge with tail.
  *
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ea1aecb6a50d..36318182e4ec 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -263,7 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
 {
 	mutex_unlock(&head->mutex);
 }
-
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+			   struct btrfs_delayed_ref_head *head);
 
 struct btrfs_delayed_ref_head *
 btrfs_select_ref_head(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d9fe58c0080..ccaccd78534e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2577,12 +2577,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 		spin_unlock(&delayed_refs->lock);
 		return 1;
 	}
-	delayed_refs->num_heads--;
-	rb_erase(&head->href_node, &delayed_refs->href_root);
-	RB_CLEAR_NODE(&head->href_node);
-	spin_unlock(&head->lock);
+	btrfs_delete_ref_head(delayed_refs, head);
 	spin_unlock(&delayed_refs->lock);
-	atomic_dec(&delayed_refs->num_entries);
+	spin_unlock(&head->lock);
 
 	trace_run_delayed_ref_head(fs_info, head, 0);
 
@@ -7122,22 +7119,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	if (!mutex_trylock(&head->mutex))
 		goto out;
 
-	/*
-	 * at this point we have a head with no other entries.  Go
-	 * ahead and process it.
-	 */
-	rb_erase(&head->href_node, &delayed_refs->href_root);
-	RB_CLEAR_NODE(&head->href_node);
-	atomic_dec(&delayed_refs->num_entries);
-
-	/*
-	 * we don't take a ref on the node because we're removing it from the
-	 * tree, so we just steal the ref the tree was holding.
-	 */
-	delayed_refs->num_heads--;
-	if (head->processing == 0)
-		delayed_refs->num_heads_ready--;
+	btrfs_delete_ref_head(delayed_refs, head);
 	head->processing = 0;
+
 	spin_unlock(&head->lock);
 	spin_unlock(&delayed_refs->lock);
 
-- 
2.14.3



* [PATCH 02/22] btrfs: add cleanup_ref_head_accounting helper
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 03/22] btrfs: use cleanup_extent_op in check_ref_cleanup Josef Bacik
                   ` (23 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We were missing some quota cleanups in check_ref_cleanup, so break the
ref head accounting cleanup into a helper and call that from both
check_ref_cleanup and cleanup_ref_head.  This will hopefully ensure that
we don't screw up accounting in the future for other things that we add.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/extent-tree.c | 65 +++++++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ccaccd78534e..cf1152d01309 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2547,6 +2547,40 @@ static int cleanup_extent_op(struct btrfs_trans_handle *trans,
 	return ret ? ret : 1;
 }
 
+static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
+					struct btrfs_delayed_ref_head *head)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_delayed_ref_root *delayed_refs =
+		&trans->transaction->delayed_refs;
+
+	if (head->total_ref_mod < 0) {
+		struct btrfs_space_info *space_info;
+		u64 flags;
+
+		if (head->is_data)
+			flags = BTRFS_BLOCK_GROUP_DATA;
+		else if (head->is_system)
+			flags = BTRFS_BLOCK_GROUP_SYSTEM;
+		else
+			flags = BTRFS_BLOCK_GROUP_METADATA;
+		space_info = __find_space_info(fs_info, flags);
+		ASSERT(space_info);
+		percpu_counter_add(&space_info->total_bytes_pinned,
+				   -head->num_bytes);
+
+		if (head->is_data) {
+			spin_lock(&delayed_refs->lock);
+			delayed_refs->pending_csums -= head->num_bytes;
+			spin_unlock(&delayed_refs->lock);
+		}
+	}
+
+	/* Also free its reserved qgroup space */
+	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
+				      head->qgroup_reserved);
+}
+
 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 			    struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_head *head)
@@ -2581,30 +2615,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 	spin_unlock(&head->lock);
 
-	trace_run_delayed_ref_head(fs_info, head, 0);
-
-	if (head->total_ref_mod < 0) {
-		struct btrfs_space_info *space_info;
-		u64 flags;
-
-		if (head->is_data)
-			flags = BTRFS_BLOCK_GROUP_DATA;
-		else if (head->is_system)
-			flags = BTRFS_BLOCK_GROUP_SYSTEM;
-		else
-			flags = BTRFS_BLOCK_GROUP_METADATA;
-		space_info = __find_space_info(fs_info, flags);
-		ASSERT(space_info);
-		percpu_counter_add(&space_info->total_bytes_pinned,
-				   -head->num_bytes);
-
-		if (head->is_data) {
-			spin_lock(&delayed_refs->lock);
-			delayed_refs->pending_csums -= head->num_bytes;
-			spin_unlock(&delayed_refs->lock);
-		}
-	}
-
 	if (head->must_insert_reserved) {
 		btrfs_pin_extent(fs_info, head->bytenr,
 				 head->num_bytes, 1);
@@ -2614,9 +2624,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	/* Also free its reserved qgroup space */
-	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
-				      head->qgroup_reserved);
+	cleanup_ref_head_accounting(trans, head);
+
+	trace_run_delayed_ref_head(fs_info, head, 0);
 	btrfs_delayed_ref_unlock(head);
 	btrfs_put_delayed_ref_head(head);
 	return 0;
@@ -7129,6 +7139,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	if (head->must_insert_reserved)
 		ret = 1;
 
+	cleanup_ref_head_accounting(trans, head);
 	mutex_unlock(&head->mutex);
 	btrfs_put_delayed_ref_head(head);
 	return ret;
-- 
2.14.3



* [PATCH 03/22] btrfs: use cleanup_extent_op in check_ref_cleanup
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
  2018-07-19 14:49 ` [PATCH 02/22] btrfs: add cleanup_ref_head_accounting helper Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 04/22] btrfs: only track ref_heads in delayed_ref_updates Josef Bacik
                   ` (22 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

Unify the extent_op handling as well: add a flag so we don't actually
run the extent op from check_ref_cleanup, and instead return a value so
the caller can skip cleaning up the ref head.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/extent-tree.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cf1152d01309..33cd43dd73ee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2528,19 +2528,24 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref
 }
 
 static int cleanup_extent_op(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info,
-			     struct btrfs_delayed_ref_head *head)
+			     struct btrfs_delayed_ref_head *head,
+			     bool run_extent_op)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	int ret;
 
 	if (!extent_op)
 		return 0;
+
 	head->extent_op = NULL;
 	if (head->must_insert_reserved) {
 		btrfs_free_delayed_extent_op(extent_op);
 		return 0;
+	} else if (!run_extent_op) {
+		return 1;
 	}
+
 	spin_unlock(&head->lock);
 	ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
 	btrfs_free_delayed_extent_op(extent_op);
@@ -2590,7 +2595,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
-	ret = cleanup_extent_op(trans, fs_info, head);
+	ret = cleanup_extent_op(trans, head, true);
 	if (ret < 0) {
 		unselect_delayed_ref_head(delayed_refs, head);
 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
@@ -7115,12 +7120,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	if (!RB_EMPTY_ROOT(&head->ref_tree))
 		goto out;
 
-	if (head->extent_op) {
-		if (!head->must_insert_reserved)
-			goto out;
-		btrfs_free_delayed_extent_op(head->extent_op);
-		head->extent_op = NULL;
-	}
+	if (cleanup_extent_op(trans, head, false))
+		goto out;
 
 	/*
 	 * waiting for the lock here would deadlock.  If someone else has it
-- 
2.14.3



* [PATCH 04/22] btrfs: only track ref_heads in delayed_ref_updates
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
  2018-07-19 14:49 ` [PATCH 02/22] btrfs: add cleanup_ref_head_accounting helper Josef Bacik
  2018-07-19 14:49 ` [PATCH 03/22] btrfs: use cleanup_extent_op in check_ref_cleanup Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-21  8:49   ` Nikolay Borisov
  2018-07-19 14:49 ` [PATCH 05/22] btrfs: introduce delayed_refs_rsv Josef Bacik
                   ` (21 subsequent siblings)
  24 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We use this number to figure out how many delayed refs to run, but
__btrfs_run_delayed_refs really only checks every time we need a new
delayed ref head, so we always run at least one ref head completely, no
matter how many refs it has.  So instead track only the ref
heads added by this trans handle and adjust the counting appropriately
in __btrfs_run_delayed_refs.
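
To illustrate the difference (a user-space toy, not kernel code; all of
the names below are made up), the run loop effectively budgets per head
rather than per ref, because the budget is only consulted when a new
head is selected:

#include <stdio.h>

struct toy_head { int nrefs; };

static void run_heads(struct toy_head *heads, int nheads, int budget)
{
	int count = 0, i = 0;

	while (i < nheads) {
		if (count >= budget)	/* budget checked only when picking a new head... */
			break;
		count++;		/* ...so the natural thing to count is heads */
		while (heads[i].nrefs-- > 0)
			;		/* every ref on the selected head runs to completion */
		i++;
	}
	printf("ran %d heads within a budget of %d\n", count, budget);
}

int main(void)
{
	struct toy_head heads[3] = { { 10 }, { 3 }, { 7 } };

	run_heads(heads, 3, 2);	/* processes two heads and all 13 of their refs */
	return 0;
}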

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/delayed-ref.c | 3 ---
 fs/btrfs/extent-tree.c | 5 +----
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e1b322d651dd..e5a8fb7ba9f6 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -234,8 +234,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 	ref->in_tree = 0;
 	btrfs_put_delayed_ref(ref);
 	atomic_dec(&delayed_refs->num_entries);
-	if (trans->delayed_ref_updates)
-		trans->delayed_ref_updates--;
 }
 
 static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -460,7 +458,6 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
 	if (ref->action == BTRFS_ADD_DELAYED_REF)
 		list_add_tail(&ref->add_list, &href->ref_add_list);
 	atomic_inc(&root->num_entries);
-	trans->delayed_ref_updates++;
 	spin_unlock(&href->lock);
 	return ret;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33cd43dd73ee..168312ead4ae 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2667,6 +2667,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 				spin_unlock(&delayed_refs->lock);
 				break;
 			}
+			count++;
 
 			/* grab the lock that says we are going to process
 			 * all the refs for this head */
@@ -2680,7 +2681,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			 */
 			if (ret == -EAGAIN) {
 				locked_ref = NULL;
-				count++;
 				continue;
 			}
 		}
@@ -2708,7 +2708,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			unselect_delayed_ref_head(delayed_refs, locked_ref);
 			locked_ref = NULL;
 			cond_resched();
-			count++;
 			continue;
 		}
 
@@ -2726,7 +2725,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 				return ret;
 			}
 			locked_ref = NULL;
-			count++;
 			continue;
 		}
 
@@ -2777,7 +2775,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		}
 
 		btrfs_put_delayed_ref(ref);
-		count++;
 		cond_resched();
 	}
 
-- 
2.14.3



* [PATCH 05/22] btrfs: introduce delayed_refs_rsv
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (2 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 04/22] btrfs: only track ref_heads in delayed_ref_updates Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 06/22] btrfs: check if free bgs for commit Josef Bacik
                   ` (20 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv.  This works most
of the time, except when it doesn't.  We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction.  Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.

So instead give them their own block_rsv.  This way we can always know
exactly how much outstanding space we need for delayed refs.  This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system.  Instead of doing math to see if it's a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.

For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums.  We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
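
As a rough sketch of the accounting model (a self-contained user-space
toy; the names and the 16k item size are invented, only the arithmetic
is meant to mirror the idea above): the rsv's size grows as ref heads
are queued and shrinks again as heads are run, with any now-excess
reservation handed back to the space info.

#include <stdio.h>

#define ITEM_BYTES 16384ULL	/* stand-in for btrfs_calc_trans_metadata_size(fs_info, 1) */

struct toy_rsv { unsigned long long size, reserved; };

/* queueing ref heads only grows the target size; flushing fills it later */
static void queue_heads(struct toy_rsv *rsv, int nr)
{
	rsv->size += nr * ITEM_BYTES;
}

/* running ref heads shrinks the target and frees any now-excess reservation */
static unsigned long long run_heads(struct toy_rsv *rsv, int nr)
{
	unsigned long long excess = 0;

	rsv->size -= nr * ITEM_BYTES;
	if (rsv->reserved > rsv->size) {
		excess = rsv->reserved - rsv->size;
		rsv->reserved = rsv->size;
	}
	return excess;	/* would be returned to the space info */
}

int main(void)
{
	struct toy_rsv rsv = { 0, 0 };

	queue_heads(&rsv, 3);
	rsv.reserved = rsv.size;	/* pretend flushing filled the reservation */
	printf("freed %llu bytes after running 2 of 3 heads\n", run_heads(&rsv, 2));
	return 0;
}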

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/ctree.h             |  24 +++-
 fs/btrfs/delayed-ref.c       |  28 ++++-
 fs/btrfs/disk-io.c           |   5 +
 fs/btrfs/extent-tree.c       | 270 +++++++++++++++++++++++++++++++++++--------
 fs/btrfs/transaction.c       |  68 +++++------
 include/trace/events/btrfs.h |   2 +
 6 files changed, 297 insertions(+), 100 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 118346aceea9..b33f851fb97d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -448,8 +448,9 @@ struct btrfs_space_info {
 #define	BTRFS_BLOCK_RSV_TRANS		3
 #define	BTRFS_BLOCK_RSV_CHUNK		4
 #define	BTRFS_BLOCK_RSV_DELOPS		5
-#define	BTRFS_BLOCK_RSV_EMPTY		6
-#define	BTRFS_BLOCK_RSV_TEMP		7
+#define BTRFS_BLOCK_RSV_DELREFS		6
+#define	BTRFS_BLOCK_RSV_EMPTY		7
+#define	BTRFS_BLOCK_RSV_TEMP		8
 
 struct btrfs_block_rsv {
 	u64 size;
@@ -790,6 +791,8 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv chunk_block_rsv;
 	/* block reservation for delayed operations */
 	struct btrfs_block_rsv delayed_block_rsv;
+	/* block reservation for delayed refs */
+	struct btrfs_block_rsv delayed_refs_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
@@ -2748,10 +2751,12 @@ enum btrfs_reserve_flush_enum {
 enum btrfs_flush_state {
 	FLUSH_DELAYED_ITEMS_NR	=	1,
 	FLUSH_DELAYED_ITEMS	=	2,
-	FLUSH_DELALLOC		=	3,
-	FLUSH_DELALLOC_WAIT	=	4,
-	ALLOC_CHUNK		=	5,
-	COMMIT_TRANS		=	6,
+	FLUSH_DELAYED_REFS_NR	=	3,
+	FLUSH_DELAYED_REFS	=	4,
+	FLUSH_DELALLOC		=	5,
+	FLUSH_DELALLOC_WAIT	=	6,
+	ALLOC_CHUNK		=	7,
+	COMMIT_TRANS		=	8,
 };
 
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -2803,6 +2808,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+int btrfs_refill_delayed_block_rsv(struct btrfs_fs_info *fs_info,
+				   enum btrfs_reserve_flush_enum flush);
+void btrfs_migrate_to_delayed_block_rsv(struct btrfs_fs_info *fs_info,
+					struct btrfs_block_rsv *src,
+					u64 num_bytes);
 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e5a8fb7ba9f6..1ab12e8876ed 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -467,11 +467,14 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
  * existing and update must have the same bytenr
  */
 static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+update_existing_head_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_delayed_ref_head *existing,
 			 struct btrfs_delayed_ref_head *update,
 			 int *old_ref_mod_ret)
 {
+	struct btrfs_delayed_ref_root *delayed_refs =
+		&trans->transaction->delayed_refs;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int old_ref_mod;
 
 	BUG_ON(existing->is_data != update->is_data);
@@ -529,10 +532,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 	 * versa we need to make sure to adjust pending_csums accordingly.
 	 */
 	if (existing->is_data) {
-		if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+		u64 csum_items =
+			btrfs_csum_bytes_to_leaves(fs_info,
+						   existing->num_bytes);
+
+		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
 			delayed_refs->pending_csums -= existing->num_bytes;
-		if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+			btrfs_delayed_refs_rsv_release(fs_info, csum_items);
+		}
+		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
 			delayed_refs->pending_csums += existing->num_bytes;
+			trans->delayed_ref_updates += csum_items;
+		}
 	}
 	spin_unlock(&existing->lock);
 }
@@ -638,7 +649,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 			&& head_ref->qgroup_reserved
 			&& existing->qgroup_ref_root
 			&& existing->qgroup_reserved);
-		update_existing_head_ref(delayed_refs, existing, head_ref,
+		update_existing_head_ref(trans, existing, head_ref,
 					 old_ref_mod);
 		/*
 		 * we've updated the existing ref, free the newly
@@ -649,8 +660,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	} else {
 		if (old_ref_mod)
 			*old_ref_mod = 0;
-		if (head_ref->is_data && head_ref->ref_mod < 0)
+		if (head_ref->is_data && head_ref->ref_mod < 0) {
 			delayed_refs->pending_csums += head_ref->num_bytes;
+			trans->delayed_ref_updates +=
+				btrfs_csum_bytes_to_leaves(trans->fs_info,
+							   head_ref->num_bytes);
+		}
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
@@ -779,6 +794,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 
 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
+	btrfs_update_delayed_refs_rsv(trans);
 
 	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
 				   action == BTRFS_ADD_DELAYED_EXTENT ?
@@ -867,6 +883,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
+	btrfs_update_delayed_refs_rsv(trans);
 
 	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
 				   action == BTRFS_ADD_DELAYED_EXTENT ?
@@ -904,6 +921,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 			     NULL, NULL, NULL);
 
 	spin_unlock(&delayed_refs->lock);
+	btrfs_update_delayed_refs_rsv(trans);
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 205092dc9390..6e5347fc3604 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2737,6 +2737,9 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
 	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
 			     BTRFS_BLOCK_RSV_DELOPS);
+	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
+			     BTRFS_BLOCK_RSV_DELREFS);
+
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->defrag_running, 0);
 	atomic_set(&fs_info->qgroup_op_seq, 0);
@@ -2956,6 +2959,8 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
+	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		printk(KERN_ERR "WE HAVE MIXED BGS\n");
 	/*
 	 * Needn't use the lock because there is no other task which will
 	 * update the flag.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 168312ead4ae..77f6f6bff36e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2558,6 +2558,7 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_root *delayed_refs =
 		&trans->transaction->delayed_refs;
+	int nr_items = 1;
 
 	if (head->total_ref_mod < 0) {
 		struct btrfs_space_info *space_info;
@@ -2578,12 +2579,15 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
 			spin_lock(&delayed_refs->lock);
 			delayed_refs->pending_csums -= head->num_bytes;
 			spin_unlock(&delayed_refs->lock);
+			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
+				head->num_bytes);
 		}
 	}
 
 	/* Also free its reserved qgroup space */
 	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
 				      head->qgroup_reserved);
+	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
 }
 
 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -2880,37 +2884,20 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_block_rsv *global_rsv;
-	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
-	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
-	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
-	u64 num_bytes, num_dirty_bgs_bytes;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	u64 reserved;
 	int ret = 0;
 
-	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-	num_heads = heads_to_leaves(fs_info, num_heads);
-	if (num_heads > 1)
-		num_bytes += (num_heads - 1) * fs_info->nodesize;
-	num_bytes <<= 1;
-	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
-							fs_info->nodesize;
-	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
-							     num_dirty_bgs);
-	global_rsv = &fs_info->global_block_rsv;
-
-	/*
-	 * If we can't allocate any more chunks lets make sure we have _lots_ of
-	 * wiggle room since running delayed refs can create more delayed refs.
-	 */
-	if (global_rsv->space_info->full) {
-		num_dirty_bgs_bytes <<= 1;
-		num_bytes <<= 1;
-	}
-
 	spin_lock(&global_rsv->lock);
-	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
-		ret = 1;
+	reserved = global_rsv->reserved;
 	spin_unlock(&global_rsv->lock);
+
+	spin_lock(&delayed_refs_rsv->lock);
+	reserved += delayed_refs_rsv->reserved;
+	if (delayed_refs_rsv->size >= reserved)
+		ret = 1;
+	spin_unlock(&delayed_refs_rsv->lock);
 	return ret;
 }
 
@@ -3691,6 +3678,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 	 */
 	mutex_lock(&trans->transaction->cache_write_mutex);
 	while (!list_empty(&dirty)) {
+		bool drop_reserve = true;
+
 		cache = list_first_entry(&dirty,
 					 struct btrfs_block_group_cache,
 					 dirty_list);
@@ -3763,6 +3752,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 					list_add_tail(&cache->dirty_list,
 						      &cur_trans->dirty_bgs);
 					btrfs_get_block_group(cache);
+					drop_reserve = false;
 				}
 				spin_unlock(&cur_trans->dirty_bgs_lock);
 			} else if (ret) {
@@ -3773,6 +3763,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 		/* if its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		if (drop_reserve)
+			btrfs_delayed_refs_rsv_release(fs_info, 1);
 
 		if (ret)
 			break;
@@ -3921,6 +3913,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		/* if its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
 		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
 	spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4900,8 +4893,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 {
 	struct reserve_ticket *ticket = NULL;
 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 	struct btrfs_trans_handle *trans;
 	u64 bytes;
+	u64 reclaim_bytes = 0;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	if (trans)
@@ -4933,12 +4928,16 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 		return -ENOSPC;
 
 	spin_lock(&delayed_rsv->lock);
-	if (delayed_rsv->size > bytes)
-		bytes = 0;
-	else
-		bytes -= delayed_rsv->size;
+	reclaim_bytes += delayed_rsv->reserved;
 	spin_unlock(&delayed_rsv->lock);
 
+	spin_lock(&delayed_refs_rsv->lock);
+	reclaim_bytes += delayed_refs_rsv->reserved;
+	spin_unlock(&delayed_refs_rsv->lock);
+	if (reclaim_bytes >= bytes)
+		goto commit;
+	bytes -= reclaim_bytes;
+
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
 				   bytes) < 0) {
 		return -ENOSPC;
@@ -4987,6 +4986,20 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
 				state == FLUSH_DELALLOC_WAIT);
 		break;
+	case FLUSH_DELAYED_REFS_NR:
+	case FLUSH_DELAYED_REFS:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		if (state == FLUSH_DELAYED_REFS_NR)
+			nr = calc_reclaim_items_nr(fs_info, num_bytes);
+		else
+			nr = 0;
+		btrfs_run_delayed_refs(trans, nr);
+		btrfs_end_transaction(trans);
+		break;
 	case ALLOC_CHUNK:
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
@@ -5459,6 +5472,93 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+/**
+ * btrfs_migrate_to_delayed_block_rsv - transfer bytes to our delayed refs rsv.
+ * @fs_info - the fs info for our fs.
+ * @src - the source block rsv to transfer from.
+ * @num_bytes - the number of bytes to transfer.
+ *
+ * This transfers up to the num_bytes amount from the src rsv to the
+ * delayed_refs_rsv.  Any extra bytes are returned to the space info.
+ */
+void btrfs_migrate_to_delayed_block_rsv(struct btrfs_fs_info *fs_info,
+					struct btrfs_block_rsv *src,
+					u64 num_bytes)
+{
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	u64 to_free = 0;
+
+	spin_lock(&src->lock);
+	src->reserved -= num_bytes;
+	src->size -= num_bytes;
+	spin_unlock(&src->lock);
+
+	spin_lock(&delayed_refs_rsv->lock);
+	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
+		u64 delta = delayed_refs_rsv->size -
+			delayed_refs_rsv->reserved;
+		if (num_bytes > delta) {
+			to_free = num_bytes - delta;
+			num_bytes = delta;
+		}
+	} else {
+		to_free = num_bytes;
+		num_bytes = 0;
+	}
+
+	if (num_bytes)
+		delayed_refs_rsv->reserved += num_bytes;
+	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
+		delayed_refs_rsv->full = 1;
+	spin_unlock(&delayed_refs_rsv->lock);
+
+	if (num_bytes)
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, num_bytes, 1);
+	if (to_free)
+		space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
+					 to_free);
+}
+
+/**
+ * btrfs_refill_delayed_block_rsv - refill the delayed block rsv.
+ * @fs_info - the fs_info for our fs.
+ * @flush - control how we can flush for this reservation.
+ *
+ * This will refill the delayed block_rsv with up to one item's worth of space and
+ * will return -ENOSPC if we can't make the reservation.
+ */
+int btrfs_refill_delayed_block_rsv(struct btrfs_fs_info *fs_info,
+				   enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+	u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved < block_rsv->size) {
+		num_bytes = block_rsv->size - block_rsv->reserved;
+		num_bytes = min(num_bytes, limit);
+	}
+	spin_unlock(&block_rsv->lock);
+
+	if (!num_bytes)
+		return 0;
+
+	ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
+				     num_bytes, flush);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 0);
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, num_bytes, 1);
+		return 0;
+	}
+
+	return ret;
+}
+
+
 /*
  * This is for space we already have accounted in space_info->bytes_may_use, so
  * basically when we're returning space from block_rsv's.
@@ -5786,6 +5886,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 	return ret;
 }
 
+static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+				     struct btrfs_block_rsv *block_rsv,
+				     u64 num_bytes, u64 *qgroup_to_release)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_block_rsv *target = delayed_rsv;
+
+	if (target->full || target == block_rsv)
+		target = global_rsv;
+
+	if (block_rsv->space_info != target->space_info)
+		target = NULL;
+
+	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+				       qgroup_to_release);
+}
+
+void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes)
+{
+	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+}
+
 /**
  * btrfs_inode_rsv_release - release any excessive reservation.
  * @inode - the inode we need to release from.
@@ -5800,7 +5925,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 	u64 released = 0;
 	u64 qgroup_to_release = 0;
@@ -5810,8 +5934,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 	 * are releasing 0 bytes, and then we'll just get the reservation over
 	 * the size free'd.
 	 */
-	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
-					   &qgroup_to_release);
+	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
+					     &qgroup_to_release);
 	if (released > 0)
 		trace_btrfs_space_reservation(fs_info, "delalloc",
 					      btrfs_ino(inode), released, 0);
@@ -5822,16 +5946,25 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 						   qgroup_to_release);
 }
 
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_rsv *block_rsv,
-			     u64 num_bytes)
+/**
+ * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
+ * @fs_info - the fs_info for our fs.
+ *
+ * This drops the delayed ref head's count from the delayed refs rsv and frees
+ * any excess reservation we had.
+ */
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 {
+	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
+	u64 released = 0;
 
-	if (global_rsv == block_rsv ||
-	    block_rsv->space_info != global_rsv->space_info)
-		global_rsv = NULL;
-	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
+	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
+					   num_bytes, NULL);
+	if (released)
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, released, 0);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5896,9 +6029,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
 	fs_info->delayed_block_rsv.space_info = space_info;
+	fs_info->delayed_refs_rsv.space_info = space_info;
 
-	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
+	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 	if (fs_info->quota_root)
@@ -5918,8 +6052,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
 }
 
+/*
+ * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
+ * @trans - the trans that may have generated delayed refs
+ *
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
+ * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ */
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	u64 num_bytes;
+
+	if (!trans->delayed_ref_updates)
+		return;
+
+	num_bytes = btrfs_calc_trans_metadata_size(fs_info,
+						   trans->delayed_ref_updates);
+	spin_lock(&delayed_rsv->lock);
+	delayed_rsv->size += num_bytes;
+	delayed_rsv->full = 0;
+	spin_unlock(&delayed_rsv->lock);
+	trans->delayed_ref_updates = 0;
+}
 
 /*
  * To be called after all the new block groups attached to the transaction
@@ -6213,6 +6373,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 	int factor;
+	int ret = 0;
 
 	/* block accounting for super block */
 	spin_lock(&info->delalloc_root_lock);
@@ -6226,8 +6387,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
-		if (!cache)
-			return -ENOENT;
+		if (!cache) {
+			ret = -ENOENT;
+			break;
+		}
 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
 				    BTRFS_BLOCK_GROUP_RAID1 |
 				    BTRFS_BLOCK_GROUP_RAID10))
@@ -6288,7 +6451,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (list_empty(&cache->dirty_list)) {
 			list_add_tail(&cache->dirty_list,
 				      &trans->transaction->dirty_bgs);
-				trans->transaction->num_dirty_bgs++;
+			trans->transaction->num_dirty_bgs++;
+			trans->delayed_ref_updates++;
 			btrfs_get_block_group(cache);
 		}
 		spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -6314,7 +6478,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
-	return 0;
+	btrfs_update_delayed_refs_rsv(trans);
+	return ret;
 }
 
 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
@@ -8354,7 +8519,12 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		goto again;
 	}
 
-	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+	/*
+	 * The global reserve still exists to save us from ourselves, so don't
+	 * warn_on if we are short on our delayed refs reserve.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 		static DEFINE_RATELIMIT_STATE(_rs,
 				DEFAULT_RATELIMIT_INTERVAL * 10,
 				/*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -10320,6 +10490,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	int factor;
 	struct btrfs_caching_control *caching_ctl = NULL;
 	bool remove_em;
+	bool remove_rsv = false;
 
 	block_group = btrfs_lookup_block_group(fs_info, group_start);
 	BUG_ON(!block_group);
@@ -10389,6 +10560,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	if (!list_empty(&block_group->dirty_list)) {
 		list_del_init(&block_group->dirty_list);
+		remove_rsv = true;
 		btrfs_put_block_group(block_group);
 	}
 	spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -10598,6 +10770,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_item(trans, root, path);
 out:
+	if (remove_rsv)
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ff5f6c719976..337397f99c95 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -455,7 +455,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 		  bool enforce_qgroups)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
@@ -484,6 +484,9 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items && root != fs_info->chunk_root) {
+		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
+		u64 delayed_refs_bytes = 0;
+
 		qgroup_reserved = num_items * fs_info->nodesize;
 		ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
 				enforce_qgroups);
@@ -491,6 +494,11 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 			return ERR_PTR(ret);
 
 		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+		if (delayed_refs_rsv->full == 0) {
+			delayed_refs_bytes = num_bytes;
+			num_bytes <<= 1;
+		}
+
 		/*
 		 * Do the reservation for the relocation root creation
 		 */
@@ -499,8 +507,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 			reloc_reserved = true;
 		}
 
-		ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
-					  num_bytes, flush);
+		ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+		if (ret)
+			goto reserve_fail;
+		if (delayed_refs_bytes) {
+			btrfs_migrate_to_delayed_block_rsv(fs_info, rsv,
+							   delayed_refs_bytes);
+			num_bytes -= delayed_refs_bytes;
+		}
+	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
+		   !delayed_refs_rsv->full) {
+		/*
+		 * Some people call with btrfs_start_transaction(root, 0)
+		 * because they can be throttled, but have some other mechanism
+		 * for reserving space.  We still want these guys to refill the
+		 * delayed block_rsv so just add 1 items worth of reservation
+		 * delayed block_rsv, so just add one item's worth of reservation
+		 */
+		ret = btrfs_refill_delayed_block_rsv(fs_info, flush);
 		if (ret)
 			goto reserve_fail;
 	}
@@ -768,22 +792,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
-	int updates;
-	int err;
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
 	    cur_trans->delayed_refs.flushing)
 		return 1;
 
-	updates = trans->delayed_ref_updates;
-	trans->delayed_ref_updates = 0;
-	if (updates) {
-		err = btrfs_run_delayed_refs(trans, updates * 2);
-		if (err) /* Error code will also eval true */
-			return err;
-	}
-
 	return should_end_transaction(trans);
 }
 
@@ -813,11 +827,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_transaction *cur_trans = trans->transaction;
-	u64 transid = trans->transid;
-	unsigned long cur = trans->delayed_ref_updates;
 	int lock = (trans->type != TRANS_JOIN_NOLOCK);
 	int err = 0;
-	int must_run_delayed_refs = 0;
 
 	if (refcount_read(&trans->use_count) > 1) {
 		refcount_dec(&trans->use_count);
@@ -828,27 +839,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	btrfs_trans_release_metadata(trans);
 	trans->block_rsv = NULL;
 
-	if (!list_empty(&trans->new_bgs))
-		btrfs_create_pending_block_groups(trans);
-
-	trans->delayed_ref_updates = 0;
-	if (!trans->sync) {
-		must_run_delayed_refs =
-			btrfs_should_throttle_delayed_refs(trans, info);
-		cur = max_t(unsigned long, cur, 32);
-
-		/*
-		 * don't make the caller wait if they are from a NOLOCK
-		 * or ATTACH transaction, it will deadlock with commit
-		 */
-		if (must_run_delayed_refs == 1 &&
-		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
-			must_run_delayed_refs = 2;
-	}
-
-	btrfs_trans_release_metadata(trans);
-	trans->block_rsv = NULL;
-
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans);
 
@@ -893,10 +883,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
-	if (must_run_delayed_refs) {
-		btrfs_async_run_delayed_refs(info, cur, transid,
-					     must_run_delayed_refs == 1);
-	}
 	return err;
 }
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 39b94ec965be..c0c536d8848a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1049,6 +1049,8 @@ TRACE_EVENT(btrfs_trigger_flush,
 		{ FLUSH_DELAYED_ITEMS,		"FLUSH_DELAYED_ITEMS"},		\
 		{ FLUSH_DELALLOC,		"FLUSH_DELALLOC"},		\
 		{ FLUSH_DELALLOC_WAIT,		"FLUSH_DELALLOC_WAIT"},		\
+		{ FLUSH_DELAYED_REFS_NR,	"FLUSH_DELAYED_REFS_NR"},	\
+		{ FLUSH_DELAYED_REFS,		"FLUSH_DELAYED_REFS"},		\
 		{ ALLOC_CHUNK,			"ALLOC_CHUNK"},			\
 		{ COMMIT_TRANS,			"COMMIT_TRANS"})
 
-- 
2.14.3



* [PATCH 06/22] btrfs: check if free bgs for commit
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (3 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 05/22] btrfs: introduce delayed_refs_rsv Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc Josef Bacik
                   ` (19 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

may_commit_transaction will skip committing the transaction if we don't
have enough pinned space or if we're trying to find space for a SYSTEM
chunk.  However, if we have pending free block groups in this transaction
we still want to commit, as we may be able to allocate a chunk to make
our reservation.  So instead of just returning ENOSPC, check if we have
free block groups pending, and if so commit the transaction to allow us
to use that free space.
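
Reduced to a standalone sketch (hypothetical names, user-space, not the
actual function), the decision becomes:

#include <stdbool.h>
#include <stdio.h>

/*
 * Commit when reclaimable rsv space or pinned bytes cover the shortfall,
 * and now also when the transaction has free block groups pending, since
 * committing lets us allocate a chunk from that space.
 */
static bool should_commit(unsigned long long needed,
			  unsigned long long rsv_reclaimable,
			  unsigned long long pinned,
			  bool have_free_bgs)
{
	if (rsv_reclaimable >= needed)
		return true;
	needed -= rsv_reclaimable;
	if (pinned >= needed)
		return true;
	return have_free_bgs;	/* previously this case was a flat -ENOSPC */
}

int main(void)
{
	/* short on pinned bytes, but a pending free block group saves us */
	printf("%d\n", should_commit(1 << 20, 0, 0, true));
	return 0;
}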

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 77f6f6bff36e..523bc197c40b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4897,6 +4897,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 	struct btrfs_trans_handle *trans;
 	u64 bytes;
 	u64 reclaim_bytes = 0;
+	bool do_commit = true;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	if (trans)
@@ -4924,8 +4925,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 	 * See if there is some space in the delayed insertion reservation for
 	 * this reservation.
 	 */
-	if (space_info != delayed_rsv->space_info)
-		return -ENOSPC;
+	if (space_info != delayed_rsv->space_info) {
+		do_commit = false;
+		goto commit;
+	}
 
 	spin_lock(&delayed_rsv->lock);
 	reclaim_bytes += delayed_rsv->reserved;
@@ -4939,15 +4942,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 	bytes -= reclaim_bytes;
 
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes) < 0) {
-		return -ENOSPC;
-	}
-
+				   bytes) < 0)
+		do_commit = false;
 commit:
 	trans = btrfs_join_transaction(fs_info->extent_root);
 	if (IS_ERR(trans))
 		return -ENOSPC;
 
+	if (!do_commit &&
+	    !test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags)) {
+		btrfs_end_transaction(trans);
+		return -ENOSPC;
+	}
 	return btrfs_commit_transaction(trans);
 }
 
-- 
2.14.3



* [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (4 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 06/22] btrfs: check if free bgs for commit Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 15:43   ` Nikolay Borisov
  2018-07-19 14:49 ` [PATCH 08/22] btrfs: dump block_rsv when dumping space info Josef Bacik
                   ` (18 subsequent siblings)
  24 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

If we're trying to make a data reservation and we have to allocate a
data chunk we could leak ret == 1, as do_chunk_alloc() will return 1 if
it allocated a chunk.  Since the end of the function is the success path,
just return 0.
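
The leak pattern, reduced to a standalone example (hypothetical names):

#include <stdio.h>

/* do_chunk_alloc()-style helper: returns 1 to say "I allocated a chunk" */
static int alloc_chunk_like(void)
{
	return 1;
}

/* caller whose contract is 0 on success: returning ret would leak the 1 */
static int reserve_data_like(void)
{
	int ret = alloc_chunk_like();

	if (ret < 0)
		return ret;
	/* ... rest of the success path ... */
	return 0;	/* not "return ret;" */
}

int main(void)
{
	printf("%d\n", reserve_data_like());
	return 0;
}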

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 523bc197c40b..6de9a180abdd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4360,7 +4360,7 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
 
-	return ret;
+	return 0;
 }
 
 int btrfs_check_data_free_space(struct inode *inode,
-- 
2.14.3



* [PATCH 08/22] btrfs: dump block_rsv when dumping space info
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (5 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 15:39   ` Nikolay Borisov
  2018-07-20 11:59   ` David Sterba
  2018-07-19 14:49 ` [PATCH 09/22] btrfs: release metadata before running delayed refs Josef Bacik
                   ` (17 subsequent siblings)
  24 siblings, 2 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

For enospc_debug, having the block rsvs dumped is super helpful to see if
we've done something wrong.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6de9a180abdd..35c1c1b7ba1d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8050,6 +8050,15 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+static void dump_block_rsv(struct btrfs_block_rsv *rsv)
+{
+	spin_lock(&rsv->lock);
+	printk(KERN_ERR "%d: size %llu reserved %llu\n",
+	       rsv->type, (unsigned long long)rsv->size,
+	       (unsigned long long)rsv->reserved);
+	spin_unlock(&rsv->lock);
+}
+
 static void dump_space_info(struct btrfs_fs_info *fs_info,
 			    struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups)
@@ -8069,6 +8078,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
 		info->bytes_readonly);
 	spin_unlock(&info->lock);
 
+	dump_block_rsv(&fs_info->global_block_rsv);
+	dump_block_rsv(&fs_info->trans_block_rsv);
+	dump_block_rsv(&fs_info->chunk_block_rsv);
+	dump_block_rsv(&fs_info->delayed_block_rsv);
+	dump_block_rsv(&fs_info->delayed_refs_rsv);
+
 	if (!dump_block_groups)
 		return;
 
-- 
2.14.3



* [PATCH 09/22] btrfs: release metadata before running delayed refs
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (6 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 08/22] btrfs: dump block_rsv when dumping space info Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS Josef Bacik
                   ` (16 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

We want to release the unused reservation we have since it refills the
delayed refs reserve, which will make everything go more smoothly when
running the delayed refs if we're short on our reservation.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/transaction.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 337397f99c95..4b171d8a7554 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1916,6 +1916,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 		return ret;
 	}
 
+	btrfs_trans_release_metadata(trans);
+	trans->block_rsv = NULL;
+
 	/* make a pass through all the delayed refs we have so far
 	 * any runnings procs may add more while we are here
 	 */
@@ -1925,9 +1928,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 		return ret;
 	}
 
-	btrfs_trans_release_metadata(trans);
-	trans->block_rsv = NULL;
-
 	cur_trans = trans->transaction;
 
 	/*
-- 
2.14.3



* [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (7 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 09/22] btrfs: release metadata before running delayed refs Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 15:35   ` Nikolay Borisov
  2018-07-19 14:49 ` [PATCH 11/22] btrfs: fix truncate throttling Josef Bacik
                   ` (15 subsequent siblings)
  24 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

If we're allocating a new space cache inode it's likely going to be
under a transaction handle, so we need to use GFP_NOFS to keep from
deadlocking.  Otherwise GFP_KERNEL is fine.
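
The trick, reduced to a user-space toy (the names and values below are
invented; in the kernel the marker is current->journal_info and the stub
is BTRFS_TRANS_STUB):

#include <stdio.h>

#define TOY_GFP_KERNEL	0
#define TOY_GFP_NOFS	1
#define TOY_TRANS_STUB	((void *)2)

static void *journal_info;	/* stands in for current->journal_info */

/* the inode allocator can't see its caller, so it keys off the marker */
static int inode_alloc_gfp(void)
{
	return journal_info ? TOY_GFP_NOFS : TOY_GFP_KERNEL;
}

int main(void)
{
	printf("no transaction:    %d\n", inode_alloc_gfp());	/* GFP_KERNEL */

	journal_info = TOY_TRANS_STUB;	/* what the space cache lookup now does */
	printf("under transaction: %d\n", inode_alloc_gfp());	/* GFP_NOFS */
	journal_info = NULL;
	return 0;
}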

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/free-space-cache.c | 5 +++++
 fs/btrfs/inode.c            | 3 ++-
 fs/btrfs/transaction.h      | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d5f80cb300be..13bc514e4e16 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -68,7 +68,12 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 	btrfs_disk_key_to_cpu(&location, &disk_key);
 	btrfs_release_path(path);
 
+	/* We need this set so that we use GFP_NOFS when allocating our inode. */
+	if (current->journal_info == NULL)
+		current->journal_info = BTRFS_TRANS_STUB;
 	inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+	if (current->journal_info == BTRFS_TRANS_STUB)
+		current->journal_info = NULL;
 	if (IS_ERR(inode))
 		return inode;
 	if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eba61bcb9bb3..14ecfe5d6110 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9211,8 +9211,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_inode *ei;
 	struct inode *inode;
+	gfp_t flags = (current->journal_info) ? GFP_NOFS : GFP_KERNEL;
 
-	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
+	ei = kmem_cache_alloc(btrfs_inode_cachep, flags);
 	if (!ei)
 		return NULL;
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94439482a0ec..172ff923bf15 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -106,6 +106,7 @@ struct btrfs_transaction {
 #define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
 
 #define BTRFS_SEND_TRANS_STUB	((void *)1)
+#define BTRFS_TRANS_STUB	((void *)2)
 
 struct btrfs_trans_handle {
 	u64 transid;
-- 
2.14.3



* [PATCH 11/22] btrfs: fix truncate throttling
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (8 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 12/22] btrfs: don't use global rsv for chunk allocation Josef Bacik
                   ` (14 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

We have a bunch of magic to make sure we're throttling delayed refs when
truncating a file.  Now that we have a delayed refs rsv and a mechanism
for refilling that reserve, simply use that instead of all of this magic.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/inode.c | 78 ++++++++++++--------------------------------------------
 1 file changed, 16 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 14ecfe5d6110..062807baf2ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4481,31 +4481,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return err;
 }
 
-static int truncate_space_check(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytes_deleted)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	int ret;
-
-	/*
-	 * This is only used to apply pressure to the enospc system, we don't
-	 * intend to use this reservation at all.
-	 */
-	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
-	bytes_deleted *= fs_info->nodesize;
-	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
-				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
-	if (!ret) {
-		trace_btrfs_space_reservation(fs_info, "transaction",
-					      trans->transid,
-					      bytes_deleted, 1);
-		trans->bytes_reserved += bytes_deleted;
-	}
-	return ret;
-
-}
-
 /*
  * Return this if we need to call truncate_block for the last bit of the
  * truncate.
@@ -4550,7 +4525,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
 	bool should_throttle = false;
-	bool should_end = false;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
@@ -4763,15 +4737,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				btrfs_abort_transaction(trans, ret);
 				break;
 			}
-			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
-				btrfs_async_run_delayed_refs(fs_info,
-					trans->delayed_ref_updates * 2,
-					trans->transid, 0);
 			if (be_nice) {
-				if (truncate_space_check(trans, root,
-							 extent_num_bytes)) {
-					should_end = true;
-				}
 				if (btrfs_should_throttle_delayed_refs(trans,
 								       fs_info))
 					should_throttle = true;
@@ -4783,7 +4749,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot ||
-		    should_throttle || should_end) {
+		    should_throttle) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -4795,23 +4761,23 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
-			if (should_throttle) {
-				unsigned long updates = trans->delayed_ref_updates;
-				if (updates) {
-					trans->delayed_ref_updates = 0;
-					ret = btrfs_run_delayed_refs(trans,
-								   updates * 2);
-					if (ret)
-						break;
-				}
-			}
+
 			/*
-			 * if we failed to refill our space rsv, bail out
-			 * and let the transaction restart
+			 * We can generate a lot of delayed refs, so we need to
+			 * throttle every once in a while and make sure we're
+			 * adding enough space to keep up with the work we are
+			 * generating.  Since we hold a transaction here we can
+			 * only FLUSH_LIMIT; if this fails we just return -EAGAIN
+			 * and let the normal space allocation stuff do its
+			 * work.
 			 */
-			if (should_end) {
-				ret = -EAGAIN;
-				break;
+			if (should_throttle) {
+				ret = btrfs_refill_delayed_block_rsv(fs_info,
+							BTRFS_RESERVE_FLUSH_LIMIT);
+				if (ret) {
+					ret = -EAGAIN;
+					break;
+				}
 			}
 			goto search_again;
 		} else {
@@ -4837,18 +4803,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_free_path(path);
-
-	if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
-		unsigned long updates = trans->delayed_ref_updates;
-		int err;
-
-		if (updates) {
-			trans->delayed_ref_updates = 0;
-			err = btrfs_run_delayed_refs(trans, updates * 2);
-			if (err)
-				ret = err;
-		}
-	}
 	return ret;
 }
 
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 12/22] btrfs: don't use global rsv for chunk allocation
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (9 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 11/22] btrfs: fix truncate throttling Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 13/22] btrfs: reset max_extent_size properly Josef Bacik
                   ` (13 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

We've done this forever because of the voodoo around knowing how much
space we have.  However, we have better ways of doing this now, and on
normal file systems we'll easily have a global reserve of 512MiB, and
since metadata chunks are usually 1GiB that means we'll allocate
metadata chunks more readily than we actually need to.  Instead use the
actual used amount when determining if we need to allocate a chunk or
not.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 35c1c1b7ba1d..c20675c265c5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4459,21 +4459,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 			      struct btrfs_space_info *sinfo, int force)
 {
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
 	u64 thresh;
 
 	if (force == CHUNK_ALLOC_FORCE)
 		return 1;
 
-	/*
-	 * We need to take into account the global rsv because for all intents
-	 * and purposes it's used space.  Don't worry about locking the
-	 * global_rsv, it doesn't change except when the transaction commits.
-	 */
-	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-		bytes_used += calc_global_rsv_need_space(global_rsv);
-
 	/*
 	 * in limited mode, we want to have some free space up to
 	 * about 1% of the FS size.
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 13/22] btrfs: reset max_extent_size properly
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (10 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 12/22] btrfs: don't use global rsv for chunk allocation Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 14/22] btrfs: don't enospc all tickets on flush failure Josef Bacik
                   ` (12 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

If we use up our block group before allocating a new one, we'll easily
get a max_extent_size that's set really low, which will result in a lot
of fragmentation.  We need to make sure we're resetting the
max_extent_size when we add a new chunk or add new space.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c20675c265c5..e057d9b2d959 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4657,6 +4657,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			goto out;
 	} else {
 		ret = 1;
+		space_info->max_extent_size = 0;
 	}
 
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -8199,10 +8200,16 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 	if (pin)
 		pin_down_extent(fs_info, cache, start, len, 1);
 	else {
+		struct btrfs_space_info *space_info = cache->space_info;
+
 		if (btrfs_test_opt(fs_info, DISCARD))
 			ret = btrfs_discard_extent(fs_info, start, len, NULL);
 		btrfs_add_free_space(cache, start, len);
 		btrfs_free_reserved_bytes(cache, len, delalloc);
+
+		spin_lock(&space_info->lock);
+		space_info->max_extent_size = 0;
+		spin_unlock(&space_info->lock);
 		trace_btrfs_reserved_extent_free(fs_info, start, len);
 	}
 
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 14/22] btrfs: don't enospc all tickets on flush failure
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (11 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 13/22] btrfs: reset max_extent_size properly Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-07-19 14:49 ` [PATCH 15/22] btrfs: run delayed iputs before committing Josef Bacik
                   ` (11 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

With the introduction of the per-inode block_rsv it became possible to
have really large reservation requests made because of data
fragmentation.  Since the ticket stuff assumed that we'd always have
relatively small reservation requests, it just killed all tickets if we
were unable to satisfy the current request.  However, this is generally
not the case anymore.  So fix this logic to instead see if we had a
ticket that we were able to give some reservation to, and if we were,
continue the flushing loop again.  Likewise, make the tickets use the
space_info_add_old_bytes() method of returning what reservation they did
receive, in the hope that it can satisfy reservations down the line.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e057d9b2d959..355884445f38 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4864,6 +4864,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 }
 
 struct reserve_ticket {
+	u64 orig_bytes;
 	u64 bytes;
 	int error;
 	struct list_head list;
@@ -5081,7 +5082,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
 {
 	struct reserve_ticket *ticket;
 
@@ -5090,7 +5091,10 @@ static void wake_all_tickets(struct list_head *head)
 		list_del_init(&ticket->list);
 		ticket->error = -ENOSPC;
 		wake_up(&ticket->wait);
+		if (ticket->bytes != ticket->orig_bytes)
+			return true;
 	}
+	return false;
 }
 
 /*
@@ -5145,8 +5149,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		if (flush_state > COMMIT_TRANS) {
 			commit_cycles++;
 			if (commit_cycles > 2) {
-				wake_all_tickets(&space_info->tickets);
-				space_info->flush = 0;
+				if (wake_all_tickets(&space_info->tickets)) {
+					flush_state = FLUSH_DELAYED_ITEMS_NR;
+					commit_cycles--;
+				} else {
+					space_info->flush = 0;
+				}
 			} else {
 				flush_state = FLUSH_DELAYED_ITEMS_NR;
 			}
@@ -5198,10 +5206,11 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 
 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 			       struct btrfs_space_info *space_info,
-			       struct reserve_ticket *ticket, u64 orig_bytes)
+			       struct reserve_ticket *ticket)
 
 {
 	DEFINE_WAIT(wait);
+	u64 reclaim_bytes = 0;
 	int ret = 0;
 
 	spin_lock(&space_info->lock);
@@ -5222,14 +5231,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 		ret = ticket->error;
 	if (!list_empty(&ticket->list))
 		list_del_init(&ticket->list);
-	if (ticket->bytes && ticket->bytes < orig_bytes) {
-		u64 num_bytes = orig_bytes - ticket->bytes;
-		space_info->bytes_may_use -= num_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, num_bytes, 0);
-	}
+	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
 	spin_unlock(&space_info->lock);
 
+	if (reclaim_bytes)
+		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
 	return ret;
 }
 
@@ -5255,6 +5262,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 {
 	struct reserve_ticket ticket;
 	u64 used;
+	u64 reclaim_bytes = 0;
 	int ret = 0;
 
 	ASSERT(orig_bytes);
@@ -5290,6 +5298,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 * the list and we will do our own flushing further down.
 	 */
 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+		ticket.orig_bytes = orig_bytes;
 		ticket.bytes = orig_bytes;
 		ticket.error = 0;
 		init_waitqueue_head(&ticket.wait);
@@ -5330,25 +5339,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 		return ret;
 
 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		return wait_reserve_ticket(fs_info, space_info, &ticket,
-					   orig_bytes);
+		return wait_reserve_ticket(fs_info, space_info, &ticket);
 
 	ret = 0;
 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
 	spin_lock(&space_info->lock);
 	if (ticket.bytes) {
-		if (ticket.bytes < orig_bytes) {
-			u64 num_bytes = orig_bytes - ticket.bytes;
-			space_info->bytes_may_use -= num_bytes;
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      num_bytes, 0);
-
-		}
+		if (ticket.bytes < orig_bytes)
+			reclaim_bytes = orig_bytes - ticket.bytes;
 		list_del_init(&ticket.list);
 		ret = -ENOSPC;
 	}
 	spin_unlock(&space_info->lock);
+
+	if (reclaim_bytes)
+		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
 	ASSERT(list_empty(&ticket.list));
 	return ret;
 }
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 15/22] btrfs: run delayed iputs before committing
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (12 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 14/22] btrfs: don't enospc all tickets on flush failure Josef Bacik
@ 2018-07-19 14:49 ` Josef Bacik
  2018-08-21 13:51   ` David Sterba
  2018-07-19 14:50 ` [PATCH 16/22] btrfs: loop in inode_rsv_refill Josef Bacik
                   ` (10 subsequent siblings)
  24 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:49 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

We want to have a complete picture of any delayed inode updates before
we make the decision to commit or not, so make sure we run delayed iputs
before making that decision.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 355884445f38..81396bac53f1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4923,6 +4923,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 		goto commit;
 	}
 
+	mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
+	btrfs_run_delayed_iputs(fs_info);
+	mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+
 	spin_lock(&delayed_rsv->lock);
 	reclaim_bytes += delayed_rsv->reserved;
 	spin_unlock(&delayed_rsv->lock);
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 16/22] btrfs: loop in inode_rsv_refill
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (13 preceding siblings ...)
  2018-07-19 14:49 ` [PATCH 15/22] btrfs: run delayed iputs before committing Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-19 14:50 ` [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path Josef Bacik
                   ` (9 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

With severe fragmentation we can end up with our inode rsv size being
huge during writeout, which would cause us to need to make very large
metadata reservations.  However, we may not actually need that much once
writeout is complete.  So instead try to make our reservation, and if we
can't make it, re-calculate our new reservation size and try again.  If
our reservation size doesn't change between tries, then we know we are
actually out of space and can error out.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 81396bac53f1..66b28b29839b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5860,10 +5860,11 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-	u64 num_bytes = 0;
+	u64 num_bytes = 0, last = 0;
 	u64 qgroup_num_bytes = 0;
 	int ret = -ENOSPC;
 
+again:
 	spin_lock(&block_rsv->lock);
 	if (block_rsv->reserved < block_rsv->size)
 		num_bytes = block_rsv->size - block_rsv->reserved;
@@ -5888,8 +5889,22 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 		spin_lock(&block_rsv->lock);
 		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
 		spin_unlock(&block_rsv->lock);
-	} else
+	} else {
 		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+
+		/*
+		 * If we are fragmented we can end up with a lot of outstanding
+		 * extents which will make our size be much larger than our
+		 * reserved amount.  If we happen to try to do a reservation
+		 * here that may result in us trying to do a pretty hefty
+		 * reservation, which we may not need once delalloc flushing
+		 * happens.  If this is the case try and do the reserve again.
+		 */
+		if (flush == BTRFS_RESERVE_FLUSH_ALL && last != num_bytes) {
+			last = num_bytes;
+			goto again;
+		}
+	}
 	return ret;
 }
 
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (14 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 16/22] btrfs: loop in inode_rsv_refill Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-19 15:12   ` Filipe Manana
  2018-07-19 15:21   ` Filipe Manana
  2018-07-19 14:50 ` [PATCH 18/22] btrfs: add ALLOC_CHUNK_FORCE to the flushing code Josef Bacik
                   ` (8 subsequent siblings)
  24 siblings, 2 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Since we've changed the fsync() path to always run ordered extents
before doing the tree log we no longer need to take the dio_sem in the
tree log path.  This gets rid of a lockdep splat that I was seeing with
the AIO tests.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/tree-log.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f8220ec02036..aa06e1954b84 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4439,7 +4439,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
 	INIT_LIST_HEAD(&extents);
 
-	down_write(&inode->dio_sem);
 	write_lock(&tree->lock);
 	test_gen = root->fs_info->last_trans_committed;
 	logged_start = start;
@@ -4520,7 +4519,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 	WARN_ON(!list_empty(&extents));
 	write_unlock(&tree->lock);
-	up_write(&inode->dio_sem);
 
 	btrfs_release_path(path);
 	if (!ret)
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 18/22] btrfs: add ALLOC_CHUNK_FORCE to the flushing code
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (15 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-19 14:50 ` [PATCH 19/22] btrfs: set max_extent_size properly Josef Bacik
                   ` (7 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

With my change to no longer take the global reserve into account when
allocating metadata chunks, we have a side-effect for mixed block group
fs'es where we are no longer allocating enough chunks for the
data/metadata requirements.  To deal with this, add an ALLOC_CHUNK_FORCE
step to the flushing state machine.  This will only get used if we've
already made a full loop through the flushing machinery and tried
committing the transaction.  If we have, then we can try to force a
chunk allocation since we likely need it to make progress.  This
resolves the issues I was seeing with the mixed bg tests in xfstests
with my previous patch.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/ctree.h             | 3 ++-
 fs/btrfs/extent-tree.c       | 7 ++++++-
 include/trace/events/btrfs.h | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b33f851fb97d..95f841d8b27c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2756,7 +2756,8 @@ enum btrfs_flush_state {
 	FLUSH_DELALLOC		=	5,
 	FLUSH_DELALLOC_WAIT	=	6,
 	ALLOC_CHUNK		=	7,
-	COMMIT_TRANS		=	8,
+	ALLOC_CHUNK_FORCE	=	8,
+	COMMIT_TRANS		=	9,
 };
 
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 66b28b29839b..626fb6a92dda 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5004,6 +5004,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		btrfs_end_transaction(trans);
 		break;
 	case ALLOC_CHUNK:
+	case ALLOC_CHUNK_FORCE:
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
@@ -5011,7 +5012,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		}
 		ret = do_chunk_alloc(trans, fs_info,
 				     btrfs_metadata_alloc_profile(fs_info),
-				     CHUNK_ALLOC_NO_FORCE);
+				     (state == ALLOC_CHUNK) ?
+				     CHUNK_ALLOC_NO_FORCE :
+				     CHUNK_ALLOC_FORCE);
 		btrfs_end_transaction(trans);
 		if (ret > 0 || ret == -ENOSPC)
 			ret = 0;
@@ -5164,6 +5167,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 			}
 		}
 		spin_unlock(&space_info->lock);
+		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+			flush_state++;
 	} while (flush_state <= COMMIT_TRANS);
 }
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index c0c536d8848a..de38e849f963 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1052,6 +1052,7 @@ TRACE_EVENT(btrfs_trigger_flush,
 		{ FLUSH_DELAYED_REFS_NR,	"FLUSH_DELAYED_REFS_NR"},	\
 		{ FLUSH_DELAYED_REFS,		"FLUSH_ELAYED_REFS"},		\
 		{ ALLOC_CHUNK,			"ALLOC_CHUNK"},			\
+		{ ALLOC_CHUNK_FORCE,		"ALLOC_CHUNK_FORCE"},		\
 		{ COMMIT_TRANS,			"COMMIT_TRANS"})
 
 TRACE_EVENT(btrfs_flush_space,
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 19/22] btrfs: set max_extent_size properly
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (16 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 18/22] btrfs: add ALLOC_CHUNK_FORCE to the flushing code Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-19 14:50 ` [PATCH 20/22] btrfs: don't use ctl->free_space for max_extent_size Josef Bacik
                   ` (6 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We can't use entry->bytes if our entry is a bitmap entry; we need to use
entry->max_extent_size in that case.  Fix up all the logic to make this
consistent.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/free-space-cache.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 13bc514e4e16..92e1c30356b9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1774,6 +1774,18 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	return -1;
 }
 
+static void set_max_extent_size(struct btrfs_free_space *entry,
+				u64 *max_extent_size)
+{
+	if (entry->bitmap) {
+		if (entry->max_extent_size > *max_extent_size)
+			*max_extent_size = entry->max_extent_size;
+	} else {
+		if (entry->bytes > *max_extent_size)
+			*max_extent_size = entry->bytes;
+	}
+}
+
 /* Cache the size of the max extent in bytes */
 static struct btrfs_free_space *
 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1795,8 +1807,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 	for (node = &entry->offset_index; node; node = rb_next(node)) {
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 		if (entry->bytes < *bytes) {
-			if (entry->bytes > *max_extent_size)
-				*max_extent_size = entry->bytes;
+			set_max_extent_size(entry, max_extent_size);
 			continue;
 		}
 
@@ -1814,8 +1825,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 		}
 
 		if (entry->bytes < *bytes + align_off) {
-			if (entry->bytes > *max_extent_size)
-				*max_extent_size = entry->bytes;
+			set_max_extent_size(entry, max_extent_size);
 			continue;
 		}
 
@@ -1827,8 +1837,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 				*offset = tmp;
 				*bytes = size;
 				return entry;
-			} else if (size > *max_extent_size) {
-				*max_extent_size = size;
+			} else {
+				set_max_extent_size(entry, max_extent_size);
 			}
 			continue;
 		}
@@ -2688,8 +2698,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 
 	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
 	if (err) {
-		if (search_bytes > *max_extent_size)
-			*max_extent_size = search_bytes;
+		set_max_extent_size(entry, max_extent_size);
 		return 0;
 	}
 
@@ -2726,8 +2735,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 
 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
 	while (1) {
-		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
-			*max_extent_size = entry->bytes;
+		if (entry->bytes < bytes)
+			set_max_extent_size(entry, max_extent_size);
 
 		if (entry->bytes < bytes ||
 		    (!entry->bitmap && entry->offset < min_start)) {
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 20/22] btrfs: don't use ctl->free_space for max_extent_size
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (17 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 19/22] btrfs: set max_extent_size properly Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-19 14:50 ` [PATCH 21/22] btrfs: reset max_extent_size on clear in a bitmap Josef Bacik
                   ` (5 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

max_extent_size is supposed to be the largest contiguous range for the
space info, and ctl->free_space is the total free space in the block
group.  We need to keep track of these separately and _only_ use the
max_free_space if we don't have a max_extent_size, as that means our
original request was too large to search any of the block groups for,
and therefore we wouldn't have a max_extent_size set.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/extent-tree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 626fb6a92dda..7311a232e45d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7598,6 +7598,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 	struct btrfs_block_group_cache *block_group = NULL;
 	u64 search_start = 0;
 	u64 max_extent_size = 0;
+	u64 max_free_space = 0;
 	u64 empty_cluster = 0;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
@@ -7893,8 +7894,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 			spin_lock(&ctl->tree_lock);
 			if (ctl->free_space <
 			    num_bytes + empty_cluster + empty_size) {
-				if (ctl->free_space > max_extent_size)
-					max_extent_size = ctl->free_space;
+				max_free_space = max(max_free_space,
+						     ctl->free_space);
 				spin_unlock(&ctl->tree_lock);
 				goto loop;
 			}
@@ -8063,6 +8064,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 	}
 out:
 	if (ret == -ENOSPC) {
+		if (!max_extent_size)
+			max_extent_size = max_free_space;
 		spin_lock(&space_info->lock);
 		space_info->max_extent_size = max_extent_size;
 		spin_unlock(&space_info->lock);
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 21/22] btrfs: reset max_extent_size on clear in a bitmap
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (18 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 20/22] btrfs: don't use ctl->free_space for max_extent_size Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-19 14:50 ` [PATCH 22/22] btrfs: only run delayed refs if we're committing Josef Bacik
                   ` (4 subsequent siblings)
  24 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

We need to clear the max_extent_size when we clear bits from a bitmap,
since the bits we cleared could have been from the range that contains
the max_extent_size.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 92e1c30356b9..0feb735a78a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1691,6 +1691,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 	bitmap_clear(info->bitmap, start, count);
 
 	info->bytes -= bytes;
+	if (info->max_extent_size > ctl->unit)
+		info->max_extent_size = 0;
 }
 
 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 22/22] btrfs: only run delayed refs if we're committing
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (19 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 21/22] btrfs: reset max_extent_size on clear in a bitmap Josef Bacik
@ 2018-07-19 14:50 ` Josef Bacik
  2018-07-20  7:37   ` Nikolay Borisov
  2018-07-19 16:08 ` [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper David Sterba
                   ` (3 subsequent siblings)
  24 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 14:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

I noticed in a giant dbench run that we spent a lot of time on lock
contention while running transaction commit.  This is because dbench
results in a lot of fsync()'s that do a btrfs_commit_transaction(), and
they all run the delayed refs first thing, so they all contend with
each other.  This leads to seconds of 0 throughput.  Change this to only
run the delayed refs if we're the ones committing the transaction.  This
makes the latency go away and we get no more lock contention.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/transaction.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4b171d8a7554..39ff9378b3db 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1919,15 +1919,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	btrfs_trans_release_metadata(trans);
 	trans->block_rsv = NULL;
 
-	/* make a pass through all the delayed refs we have so far
-	 * any runnings procs may add more while we are here
-	 */
-	ret = btrfs_run_delayed_refs(trans, 0);
-	if (ret) {
-		btrfs_end_transaction(trans);
-		return ret;
-	}
-
 	cur_trans = trans->transaction;
 
 	/*
@@ -1940,12 +1931,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans);
 
-	ret = btrfs_run_delayed_refs(trans, 0);
-	if (ret) {
-		btrfs_end_transaction(trans);
-		return ret;
-	}
-
 	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
 		int run_it = 0;
 
@@ -2016,6 +2001,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 		spin_unlock(&fs_info->trans_lock);
 	}
 
+	/*
+	 * We are now the only one in the commit area, we can run delayed refs
+	 * without hitting a bunch of lock contention from a lot of people
+	 * trying to commit the transaction at once.
+	 */
+	ret = btrfs_run_delayed_refs(trans, 0);
+	if (ret) {
+		btrfs_end_transaction(trans);
+		return ret;
+	}
+
 	extwriter_counter_dec(cur_trans, trans->type);
 
 	ret = btrfs_start_delalloc_flush(fs_info);
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path
  2018-07-19 14:50 ` [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path Josef Bacik
@ 2018-07-19 15:12   ` Filipe Manana
  2018-07-19 15:21   ` Filipe Manana
  1 sibling, 0 replies; 44+ messages in thread
From: Filipe Manana @ 2018-07-19 15:12 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 3:50 PM, Josef Bacik <josef@toxicpanda.com> wrote:
> Since we've changed the fsync() path to always run ordered extents
> before doing the tree log we no longer need to take the dio_sem in the
> tree log path.  This gets rid of a lockdep splat that I was seeing with
> the AIO tests.

The dio_sem can be removed completely; it was added to synchronize
lockless dio writes with fsync to avoid races.
I.e., just removing it here in the tree-log makes it useless.

thanks!

>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/tree-log.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index f8220ec02036..aa06e1954b84 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -4439,7 +4439,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>
>         INIT_LIST_HEAD(&extents);
>
> -       down_write(&inode->dio_sem);
>         write_lock(&tree->lock);
>         test_gen = root->fs_info->last_trans_committed;
>         logged_start = start;
> @@ -4520,7 +4519,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>         }
>         WARN_ON(!list_empty(&extents));
>         write_unlock(&tree->lock);
> -       up_write(&inode->dio_sem);
>
>         btrfs_release_path(path);
>         if (!ret)
> --
> 2.14.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path
  2018-07-19 14:50 ` [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path Josef Bacik
  2018-07-19 15:12   ` Filipe Manana
@ 2018-07-19 15:21   ` Filipe Manana
  2018-07-19 15:54     ` Josef Bacik
  1 sibling, 1 reply; 44+ messages in thread
From: Filipe Manana @ 2018-07-19 15:21 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 3:50 PM, Josef Bacik <josef@toxicpanda.com> wrote:
> Since we've changed the fsync() path to always run ordered extents
> before doing the tree log we no longer need to take the dio_sem in the
> tree log path.  This gets rid of a lockdep splat that I was seeing with
> the AIO tests.

So actually, we still need it (or some other means of sync).
Because even after the recent changes to fsync, the fast path still
logs extent items based on the extent maps, and the dio write path
creates the extent map first and then the ordered extent.
So the old problem can still happen between concurrent fsync and
lockless dio write, where fsync logs an extent item for an extent map
whose ordered extent we never waited for.
The solution prior to the introduction of dio_sem solved this - make
the dio write create first the ordered extent, and, only after it,
create the extent map.

>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/tree-log.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index f8220ec02036..aa06e1954b84 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -4439,7 +4439,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>
>         INIT_LIST_HEAD(&extents);
>
> -       down_write(&inode->dio_sem);
>         write_lock(&tree->lock);
>         test_gen = root->fs_info->last_trans_committed;
>         logged_start = start;
> @@ -4520,7 +4519,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>         }
>         WARN_ON(!list_empty(&extents));
>         write_unlock(&tree->lock);
> -       up_write(&inode->dio_sem);
>
>         btrfs_release_path(path);
>         if (!ret)
> --
> 2.14.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS
  2018-07-19 14:49 ` [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS Josef Bacik
@ 2018-07-19 15:35   ` Nikolay Borisov
  2018-07-19 15:44     ` David Sterba
  2018-07-19 15:55     ` Josef Bacik
  0 siblings, 2 replies; 44+ messages in thread
From: Nikolay Borisov @ 2018-07-19 15:35 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team



On 19.07.2018 17:49, Josef Bacik wrote:
> If we're allocating a new space cache inode it's likely going to be
> under a transaction handle, so we need to use GFP_NOFS to keep from
> deadlocking.  Otherwise GFP_KERNEL is fine.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/free-space-cache.c | 5 +++++
>  fs/btrfs/inode.c            | 3 ++-
>  fs/btrfs/transaction.h      | 1 +
>  3 files changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index d5f80cb300be..13bc514e4e16 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -68,7 +68,12 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
>  	btrfs_disk_key_to_cpu(&location, &disk_key);
>  	btrfs_release_path(path);
>  
> +	/* We need this set so that we use GFP_NOFS when allocating our inode. */
> +	if (current->journal_info == NULL)
> +		current->journal_info = BTRFS_TRANS_STUB;
>  	inode = btrfs_iget(fs_info->sb, &location, root, NULL);
> +	if (current->journal_info == BTRFS_TRANS_STUB)
> +		current->journal_info = NULL;
This is not safe in the face of stacked filesystems, i.e. ext4 uses the
journal_info.

>  	if (IS_ERR(inode))
>  		return inode;
>  	if (is_bad_inode(inode)) {
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index eba61bcb9bb3..14ecfe5d6110 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9211,8 +9211,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
>  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>  	struct btrfs_inode *ei;
>  	struct inode *inode;
> +	gfp_t flags = (current->journal_info) ? GFP_NOFS : GFP_KERNEL;
>  
> -	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
> +	ei = kmem_cache_alloc(btrfs_inode_cachep, flags);

Why don't you just hardcode GFP_NOFS? We should be striving to remove
abuse of ->journal_info rather than proliferating it.

>  	if (!ei)
>  		return NULL;
>  
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index 94439482a0ec..172ff923bf15 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -106,6 +106,7 @@ struct btrfs_transaction {
>  #define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
>  
>  #define BTRFS_SEND_TRANS_STUB	((void *)1)
> +#define BTRFS_TRANS_STUB	((void *)2)
>  
>  struct btrfs_trans_handle {
>  	u64 transid;
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 08/22] btrfs: dump block_rsv when dumping space info
  2018-07-19 14:49 ` [PATCH 08/22] btrfs: dump block_rsv when dumping space info Josef Bacik
@ 2018-07-19 15:39   ` Nikolay Borisov
  2018-07-20 11:59   ` David Sterba
  1 sibling, 0 replies; 44+ messages in thread
From: Nikolay Borisov @ 2018-07-19 15:39 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team



On 19.07.2018 17:49, Josef Bacik wrote:
> For enospc_debug having the block rsvs is super helpful to see if we've
> done something wrong.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 6de9a180abdd..35c1c1b7ba1d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -8050,6 +8050,15 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> +static void dump_block_rsv(struct btrfs_block_rsv *rsv)
> +{
> +	spin_lock(&rsv->lock);
> +	printk(KERN_ERR "%d: size %llu reserved %llu\n",
> +	       rsv->type, (unsigned long long)rsv->size,
> +	       (unsigned long long)rsv->reserved);

Why do you cast to unsigned long long given that size/reserved are
already defined as u64? Also use pr_err rather than printk; a sketch of
that variant follows after the quoted diff.

> +	spin_unlock(&rsv->lock);
> +}
> +
>  static void dump_space_info(struct btrfs_fs_info *fs_info,
>  			    struct btrfs_space_info *info, u64 bytes,
>  			    int dump_block_groups)
> @@ -8069,6 +8078,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
>  		info->bytes_readonly);
>  	spin_unlock(&info->lock);
>  
> +	dump_block_rsv(&fs_info->global_block_rsv);
> +	dump_block_rsv(&fs_info->trans_block_rsv);
> +	dump_block_rsv(&fs_info->chunk_block_rsv);
> +	dump_block_rsv(&fs_info->delayed_block_rsv);
> +	dump_block_rsv(&fs_info->delayed_refs_rsv);
> +
>  	if (!dump_block_groups)
>  		return;
>  
> 
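
For reference, a minimal sketch of the variant suggested above (pr_err() and
no casts).  This is not the posted patch; it assumes it would simply replace
dump_block_rsv() in fs/btrfs/extent-tree.c, where u64 is unsigned long long on
every supported architecture, so %llu needs no cast:

static void dump_block_rsv(struct btrfs_block_rsv *rsv)
{
	spin_lock(&rsv->lock);
	/* pr_err() rather than a bare printk(KERN_ERR ...), no u64 casts */
	pr_err("%d: size %llu reserved %llu\n",
	       rsv->type, rsv->size, rsv->reserved);
	spin_unlock(&rsv->lock);
}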

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc
  2018-07-19 14:49 ` [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc Josef Bacik
@ 2018-07-19 15:43   ` Nikolay Borisov
  2018-07-24 11:24     ` David Sterba
  0 siblings, 1 reply; 44+ messages in thread
From: Nikolay Borisov @ 2018-07-19 15:43 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team



On 19.07.2018 17:49, Josef Bacik wrote:
> If we're trying to make a data reservation and we have to allocate a
> data chunk we could leak ret == 1, as do_chunk_alloc() will return 1 if
> it allocated a chunk.  Since the end of the function is the success path
> just return 0.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

The logic flow in this function is a steaming pile of turd and is in
dire need of refactoring...
> ---
>  fs/btrfs/extent-tree.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 523bc197c40b..6de9a180abdd 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4360,7 +4360,7 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
>  				      data_sinfo->flags, bytes, 1);
>  	spin_unlock(&data_sinfo->lock);
>  
> -	return ret;
> +	return 0;
>  }
>  
>  int btrfs_check_data_free_space(struct inode *inode,
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS
  2018-07-19 15:35   ` Nikolay Borisov
@ 2018-07-19 15:44     ` David Sterba
  2018-07-19 15:56       ` Josef Bacik
  2018-07-19 15:55     ` Josef Bacik
  1 sibling, 1 reply; 44+ messages in thread
From: David Sterba @ 2018-07-19 15:44 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: Josef Bacik, linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 06:35:33PM +0300, Nikolay Borisov wrote:
> 
> 
> On 19.07.2018 17:49, Josef Bacik wrote:
> > If we're allocating a new space cache inode it's likely going to be
> > under a transaction handle, so we need to use GFP_NOFS to keep from
> > deadlocking.  Otherwise GFP_KERNEL is fine.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  fs/btrfs/free-space-cache.c | 5 +++++
> >  fs/btrfs/inode.c            | 3 ++-
> >  fs/btrfs/transaction.h      | 1 +
> >  3 files changed, 8 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > index d5f80cb300be..13bc514e4e16 100644
> > --- a/fs/btrfs/free-space-cache.c
> > +++ b/fs/btrfs/free-space-cache.c
> > @@ -68,7 +68,12 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
> >  	btrfs_disk_key_to_cpu(&location, &disk_key);
> >  	btrfs_release_path(path);
> >  
> > +	/* We need this set so that we use GFP_NOFS when allocating our inode. */
> > +	if (current->journal_info == NULL)
> > +		current->journal_info = BTRFS_TRANS_STUB;
> >  	inode = btrfs_iget(fs_info->sb, &location, root, NULL);
> > +	if (current->journal_info == BTRFS_TRANS_STUB)
> > +		current->journal_info = NULL;
> This is not safe in the face of stacked filesystems, i.e. ext4 uses the
> journal_info.
> 
> >  	if (IS_ERR(inode))
> >  		return inode;
> >  	if (is_bad_inode(inode)) {
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index eba61bcb9bb3..14ecfe5d6110 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -9211,8 +9211,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
> >  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> >  	struct btrfs_inode *ei;
> >  	struct inode *inode;
> > +	gfp_t flags = (current->journal_info) ? GFP_NOFS : GFP_KERNEL;
> >  
> > -	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
> > +	ei = kmem_cache_alloc(btrfs_inode_cachep, flags);
> 
> Why don't you just hardcode GFP_NOFS? We should be striving to remove
> abuse of ->journal_info rather than proliferating it.

We're also removing unnecessary use of GFP_NOFS. The right way here is
to use memalloc_nofs_save/memalloc_nofs_restore around btrfs_iget.
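
For reference, a minimal sketch of that approach, assuming it wraps the
btrfs_iget() call quoted above in __lookup_free_space_inode(); the helper name
here is made up for illustration and this is not an actual patch:

#include <linux/sched/mm.h>

/*
 * Sketch only: a scoped NOFS region makes every allocation done under
 * btrfs_iget() (including the one in btrfs_alloc_inode()) behave as
 * GFP_NOFS, without touching current->journal_info.
 */
static struct inode *lookup_free_space_inode_nofs(struct btrfs_fs_info *fs_info,
						  struct btrfs_key *location,
						  struct btrfs_root *root)
{
	unsigned int nofs_flag;
	struct inode *inode;

	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget(fs_info->sb, location, root, NULL);
	memalloc_nofs_restore(nofs_flag);

	return inode;
}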

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path
  2018-07-19 15:21   ` Filipe Manana
@ 2018-07-19 15:54     ` Josef Bacik
  2018-07-19 15:57       ` Filipe Manana
  0 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 15:54 UTC (permalink / raw)
  To: Filipe Manana; +Cc: Josef Bacik, linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 04:21:58PM +0100, Filipe Manana wrote:
> On Thu, Jul 19, 2018 at 3:50 PM, Josef Bacik <josef@toxicpanda.com> wrote:
> > Since we've changed the fsync() path to always run ordered extents
> > before doing the tree log we no longer need to take the dio_sem in the
> > tree log path.  This gets rid of a lockdep splat that I was seeing with
> > the AIO tests.
> 
> So actually, we still need it (or some other means of sync).
> Because even after the recent changes to fsync, the fast path still
> logs extent items based on the extent maps, and the dio write path
> creates first the extent map and then the ordered extent.
> So the old problem can still happen between concurrent fsync and
> lockless dio write, where fsync logs an extent item for an extent map
> whose ordered extent we never waited for.
> The solution prior to the introduction of dio_sem solved this - make
> the dio write create first the ordered extent, and, only after it,
> create the extent map.
> 

Oooh balls I see.  This is still a problem even if we add the ordered extent
first, because we can easily just start the lockless dio write after we've
waited for ordered extents, so the order in which we create the extent map and
ordered extent doesn't actually matter.  We still have this lockdep thing, so I
think we just move the dio_sem to the start of fsync: if we're fsyncing, you
just don't get to do lockless dio writes while we're doing the fsync.  What do
you think?  Thanks,

Josef
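
Purely to illustrate the idea being floated above (this is not the actual
follow-up patch), a rough sketch of what taking dio_sem for the whole of fsync
could look like, assuming it lives in fs/btrfs/file.c where the usual headers
are already in place; the function name is made up and everything except the
locking is elided:

/*
 * Hypothetical sketch: hold dio_sem across the whole fsync so a lockless
 * dio write cannot create an extent map that the tree-log fast path then
 * logs without its ordered extent ever being waited on.
 */
static int btrfs_sync_file_sketch(struct file *file, loff_t start,
				  loff_t end, int datasync)
{
	struct inode *inode = d_inode(file_dentry(file));
	int ret = 0;

	inode_lock(inode);
	down_write(&BTRFS_I(inode)->dio_sem);

	/* ... flush dirty pages, wait for ordered extents, log the inode ... */

	up_write(&BTRFS_I(inode)->dio_sem);
	inode_unlock(inode);
	return ret;
}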

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS
  2018-07-19 15:35   ` Nikolay Borisov
  2018-07-19 15:44     ` David Sterba
@ 2018-07-19 15:55     ` Josef Bacik
  1 sibling, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 15:55 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: Josef Bacik, linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 06:35:33PM +0300, Nikolay Borisov wrote:
> 
> 
> On 19.07.2018 17:49, Josef Bacik wrote:
> > If we're allocating a new space cache inode it's likely going to be
> > under a transaction handle, so we need to use GFP_NOFS to keep from
> > deadlocking.  Otherwise GFP_KERNEL is fine.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  fs/btrfs/free-space-cache.c | 5 +++++
> >  fs/btrfs/inode.c            | 3 ++-
> >  fs/btrfs/transaction.h      | 1 +
> >  3 files changed, 8 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > index d5f80cb300be..13bc514e4e16 100644
> > --- a/fs/btrfs/free-space-cache.c
> > +++ b/fs/btrfs/free-space-cache.c
> > @@ -68,7 +68,12 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
> >  	btrfs_disk_key_to_cpu(&location, &disk_key);
> >  	btrfs_release_path(path);
> >  
> > +	/* We need this set so that we use GFP_NOFS when allocating our inode. */
> > +	if (current->journal_info == NULL)
> > +		current->journal_info = BTRFS_TRANS_STUB;
> >  	inode = btrfs_iget(fs_info->sb, &location, root, NULL);
> > +	if (current->journal_info == BTRFS_TRANS_STUB)
> > +		current->journal_info = NULL;
> This is not safe in the face of stacked filesystems, i.e. ext4 uses the
> journal_info.

Stacked file systems aren't safe at all because we all use journal_info; this
doesn't change that.

> 
> >  	if (IS_ERR(inode))
> >  		return inode;
> >  	if (is_bad_inode(inode)) {
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index eba61bcb9bb3..14ecfe5d6110 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -9211,8 +9211,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
> >  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> >  	struct btrfs_inode *ei;
> >  	struct inode *inode;
> > +	gfp_t flags = (current->journal_info) ? GFP_NOFS : GFP_KERNEL;
> >  
> > -	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
> > +	ei = kmem_cache_alloc(btrfs_inode_cachep, flags);
> 
> Why don't you just hardcode GFP_NOFS? We should be striving to remove
> abuse of ->journal_info rather than proliferating it.
> 

Because every time I use GFP_NOFS some mm guy shows up and complains.  I'm fine
with doing GFP_NOFS, but the vast majority of allocations are in paths where
GFP_KERNEL is fine.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS
  2018-07-19 15:44     ` David Sterba
@ 2018-07-19 15:56       ` Josef Bacik
  0 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 15:56 UTC (permalink / raw)
  To: dsterba, Nikolay Borisov, Josef Bacik, linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 05:44:53PM +0200, David Sterba wrote:
> On Thu, Jul 19, 2018 at 06:35:33PM +0300, Nikolay Borisov wrote:
> > 
> > 
> > On 19.07.2018 17:49, Josef Bacik wrote:
> > > If we're allocating a new space cache inode it's likely going to be
> > > under a transaction handle, so we need to use GFP_NOFS to keep from
> > > deadlocking.  Otherwise GFP_KERNEL is fine.
> > > 
> > > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > > ---
> > >  fs/btrfs/free-space-cache.c | 5 +++++
> > >  fs/btrfs/inode.c            | 3 ++-
> > >  fs/btrfs/transaction.h      | 1 +
> > >  3 files changed, 8 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > > index d5f80cb300be..13bc514e4e16 100644
> > > --- a/fs/btrfs/free-space-cache.c
> > > +++ b/fs/btrfs/free-space-cache.c
> > > @@ -68,7 +68,12 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
> > >  	btrfs_disk_key_to_cpu(&location, &disk_key);
> > >  	btrfs_release_path(path);
> > >  
> > > +	/* We need this set so that we use GFP_NOFS when allocating our inode. */
> > > +	if (current->journal_info == NULL)
> > > +		current->journal_info = BTRFS_TRANS_STUB;
> > >  	inode = btrfs_iget(fs_info->sb, &location, root, NULL);
> > > +	if (current->journal_info == BTRFS_TRANS_STUB)
> > > +		current->journal_info = NULL;
> > This is not safe in the face of stacked filesystem, i.e ext4 uses the
> > journal_info.
> > 
> > >  	if (IS_ERR(inode))
> > >  		return inode;
> > >  	if (is_bad_inode(inode)) {
> > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > > index eba61bcb9bb3..14ecfe5d6110 100644
> > > --- a/fs/btrfs/inode.c
> > > +++ b/fs/btrfs/inode.c
> > > @@ -9211,8 +9211,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
> > >  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> > >  	struct btrfs_inode *ei;
> > >  	struct inode *inode;
> > > +	gfp_t flags = (current->journal_info) ? GFP_NOFS : GFP_KERNEL;
> > >  
> > > -	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
> > > +	ei = kmem_cache_alloc(btrfs_inode_cachep, flags);
> > 
> > Why don't you just hardcode GFP_NOFS? We should be striving at removing
> > abuse of ->journal_info than proliferating it.
> 
> We're also removing unnecessary use of GFP_NOFS. The right way here is
> to use memalloc_nofs_save/memalloc_nofs_restore around btrfs_iget.

Huh, I didn't know this existed; that's neat.  I'll change this patch to do that,
thanks,

Josef

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path
  2018-07-19 15:54     ` Josef Bacik
@ 2018-07-19 15:57       ` Filipe Manana
  0 siblings, 0 replies; 44+ messages in thread
From: Filipe Manana @ 2018-07-19 15:57 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 4:54 PM, Josef Bacik <josef@toxicpanda.com> wrote:
> On Thu, Jul 19, 2018 at 04:21:58PM +0100, Filipe Manana wrote:
>> On Thu, Jul 19, 2018 at 3:50 PM, Josef Bacik <josef@toxicpanda.com> wrote:
>> > Since we've changed the fsync() path to always run ordered extents
>> > before doing the tree log we no longer need to take the dio_sem in the
>> > tree log path.  This gets rid of a lockdep splat that I was seeing with
>> > the AIO tests.
>>
>> So actually, we still need it (or some other means of sync).
>> Because even after the recent changes to fsync, the fast path still
>> logs extent items based on the extent maps, and the dio write path
>> creates first the extent map and then the ordered extent.
>> So the old problem can still happen between concurrent fsync and
>> lockless dio write, where fsync logs an extent item for an extent map
>> whose ordered extent we never waited for.
>> The solution prior to the introduction of dio_sem solved this - make
>> the dio write create first the ordered extent, and, only after it,
>> create the extent map.
>>
>
> Oooh balls I see.  This is still a problem even if we add the ordered extent
> first, because we can easily just start the lockless dio write after we've
> waited for ordered extents, so the order in which we create the extent map and
> ordered extent doesn't actually matter.  We still have this lockdep thing, so I
> think we just move the dio_sem to the start of fsync: if we're fsyncing, you
> just don't get to do lockless dio writes while we're doing the fsync.  What do
> you think?  Thanks,

On first thought, it seems to be the simplest and sanest solution :)
Thanks.

>
> Josef



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (20 preceding siblings ...)
  2018-07-19 14:50 ` [PATCH 22/22] btrfs: only run delayed refs if we're committing Josef Bacik
@ 2018-07-19 16:08 ` David Sterba
  2018-07-19 16:12   ` Josef Bacik
  2018-07-20 13:11 ` Nikolay Borisov
                   ` (2 subsequent siblings)
  24 siblings, 1 reply; 44+ messages in thread
From: David Sterba @ 2018-07-19 16:08 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team, Josef Bacik

Please send cover letter for patchsets with more than 1 patch.
Especially for a patchset with 20+ patches that does not seem to be
doing anything trivial.

What problem does this fix, what's the overall idea behind the approach,
what workload can trigger that etc.

Thanks.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-19 16:08 ` [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper David Sterba
@ 2018-07-19 16:12   ` Josef Bacik
  2018-07-19 16:36     ` David Sterba
  0 siblings, 1 reply; 44+ messages in thread
From: Josef Bacik @ 2018-07-19 16:12 UTC (permalink / raw)
  To: David Sterba; +Cc: Josef Bacik, linux-btrfs, kernel-team, Josef Bacik

On Thu, Jul 19, 2018 at 06:08:21PM +0200, David Sterba wrote:
> Please send a cover letter for patchsets with more than 1 patch,
> especially for a patchset with 20+ patches that does not seem to be
> doing anything trivial.
> 
> What problem does this fix, what's the overall idea behind the approach,
> what workload can trigger it, etc.

They are just fixes; there's no large over-arching theme, hence no cover letter.

Josef

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-19 16:12   ` Josef Bacik
@ 2018-07-19 16:36     ` David Sterba
  0 siblings, 0 replies; 44+ messages in thread
From: David Sterba @ 2018-07-19 16:36 UTC (permalink / raw)
  To: Josef Bacik; +Cc: David Sterba, linux-btrfs, kernel-team, Josef Bacik

On Thu, Jul 19, 2018 at 12:12:52PM -0400, Josef Bacik wrote:
> On Thu, Jul 19, 2018 at 06:08:21PM +0200, David Sterba wrote:
> > Please send a cover letter for patchsets with more than 1 patch,
> > especially for a patchset with 20+ patches that does not seem to be
> > doing anything trivial.
> > 
> > What problem does this fix, what's the overall idea behind the approach,
> > what workload can trigger it, etc.
> 
> They are just fixes; there's no large over-arching theme, hence no cover letter.

A cover letter is a single point where I can address the whole patchset,
e.g. to note that it's been added to for-next or merged, or to report
bugs. It's also where you describe the changes between revisions.

The initial text does not need to be long if there's really nothing big
behind it, but please state that in advance. This sets expectations
before one goes through the patches.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 22/22] btrfs: only run delayed refs if we're committing
  2018-07-19 14:50 ` [PATCH 22/22] btrfs: only run delayed refs if we're committing Josef Bacik
@ 2018-07-20  7:37   ` Nikolay Borisov
  0 siblings, 0 replies; 44+ messages in thread
From: Nikolay Borisov @ 2018-07-20  7:37 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team



On 19.07.2018 17:50, Josef Bacik wrote:
> I noticed in a giant dbench run that we spent a lot of time on lock
> contention while running transaction commit.  This is because dbench
> results in a lot of fsync() calls that do a btrfs_commit_transaction(), and
> they all run the delayed refs first thing, so they all contend with
> each other.  This leads to seconds of 0 throughput.  Change this to only
> run the delayed refs if we're the ones committing the transaction.  This
> makes the latency go away and we get no more lock contention.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/transaction.c | 26 +++++++++++---------------
>  1 file changed, 11 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 4b171d8a7554..39ff9378b3db 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -1919,15 +1919,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>  	btrfs_trans_release_metadata(trans);
>  	trans->block_rsv = NULL;
>  
> -	/* make a pass through all the delayed refs we have so far
> -	 * any runnings procs may add more while we are here
> -	 */
> -	ret = btrfs_run_delayed_refs(trans, 0);
> -	if (ret) {
> -		btrfs_end_transaction(trans);
> -		return ret;
> -	}
> -
>  	cur_trans = trans->transaction;
>  
>  	/*
> @@ -1940,12 +1931,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>  	if (!list_empty(&trans->new_bgs))
>  		btrfs_create_pending_block_groups(trans);
>  
> -	ret = btrfs_run_delayed_refs(trans, 0);
> -	if (ret) {
> -		btrfs_end_transaction(trans);
> -		return ret;
> -	}
> -
>  	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
>  		int run_it = 0;
>  
> @@ -2016,6 +2001,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>  		spin_unlock(&fs_info->trans_lock);
>  	}
>  
> +	/*
> +	 * We are now the only one in the commit area, we can run delayed refs
> +	 * without hitting a bunch of lock contention from a lot of people
> +	 * trying to commit the transaction at once.
> +	 */
> +	ret = btrfs_run_delayed_refs(trans, 0);
> +	if (ret) {
> +		btrfs_end_transaction(trans);
> +		return ret;
> +	}
> +

Is this really needed, since we already have code which runs delayed refs
in the transaction critical section right after btrfs_run_delayed_items?
I'd rather have a single place in commit_transaction where delayed_refs
are run. If this is needed for correctness reasons then document what
that reason is; otherwise let's just remove it.

>  	extwriter_counter_dec(cur_trans, trans->type);
>  
>  	ret = btrfs_start_delalloc_flush(fs_info);
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 08/22] btrfs: dump block_rsv when dumping space info
  2018-07-19 14:49 ` [PATCH 08/22] btrfs: dump block_rsv when dumping space info Josef Bacik
  2018-07-19 15:39   ` Nikolay Borisov
@ 2018-07-20 11:59   ` David Sterba
  1 sibling, 0 replies; 44+ messages in thread
From: David Sterba @ 2018-07-20 11:59 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 10:49:52AM -0400, Josef Bacik wrote:
> For enospc_debug, having the block rsvs is super helpful to see if we've
> done something wrong.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 6de9a180abdd..35c1c1b7ba1d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -8050,6 +8050,15 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> +static void dump_block_rsv(struct btrfs_block_rsv *rsv)
> +{
> +	spin_lock(&rsv->lock);
> +	printk(KERN_ERR "%d: size %llu reserved %llu\n",
> +	       rsv->type, (unsigned long long)rsv->size,
> +	       (unsigned long long)rsv->reserved);

As you have fs_info available in the caller, you can pass it and use the
btrfs_err helper that prints the filesystem identification.
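
Something along these lines would match that suggestion; a rough,
untested sketch only, the exact message text is up to the author:

	static void dump_block_rsv(struct btrfs_fs_info *fs_info,
				   struct btrfs_block_rsv *rsv)
	{
		spin_lock(&rsv->lock);
		/* btrfs_err() prefixes the message with the filesystem id */
		btrfs_err(fs_info, "block_rsv %d: size %llu reserved %llu",
			  rsv->type, (unsigned long long)rsv->size,
			  (unsigned long long)rsv->reserved);
		spin_unlock(&rsv->lock);
	}

The callers in dump_space_info() would then pass fs_info along, e.g.
dump_block_rsv(fs_info, &fs_info->global_block_rsv).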

> +	spin_unlock(&rsv->lock);
> +}
> +
>  static void dump_space_info(struct btrfs_fs_info *fs_info,
>  			    struct btrfs_space_info *info, u64 bytes,
>  			    int dump_block_groups)
> @@ -8069,6 +8078,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
>  		info->bytes_readonly);
>  	spin_unlock(&info->lock);
>  
> +	dump_block_rsv(&fs_info->global_block_rsv);
> +	dump_block_rsv(&fs_info->trans_block_rsv);
> +	dump_block_rsv(&fs_info->chunk_block_rsv);
> +	dump_block_rsv(&fs_info->delayed_block_rsv);
> +	dump_block_rsv(&fs_info->delayed_refs_rsv);
> +
>  	if (!dump_block_groups)
>  		return;
>  
> -- 
> 2.14.3
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (21 preceding siblings ...)
  2018-07-19 16:08 ` [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper David Sterba
@ 2018-07-20 13:11 ` Nikolay Borisov
  2018-07-20 14:18   ` Josef Bacik
  2018-07-20 14:48 ` David Sterba
  2018-07-20 16:04 ` David Sterba
  24 siblings, 1 reply; 44+ messages in thread
From: Nikolay Borisov @ 2018-07-20 13:11 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team; +Cc: Josef Bacik



On 19.07.2018 17:49, Josef Bacik wrote:
> From: Josef Bacik <jbacik@fb.com>
> 
> We do this dance in cleanup_ref_head and check_ref_cleanup, unify it
> into a helper and cleanup the calling functions.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/delayed-ref.c | 14 ++++++++++++++
>  fs/btrfs/delayed-ref.h |  3 ++-
>  fs/btrfs/extent-tree.c | 24 ++++--------------------
>  3 files changed, 20 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 03dec673d12a..e1b322d651dd 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -393,6 +393,20 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
>  	return head;
>  }
>  
> +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
> +			   struct btrfs_delayed_ref_head *head)
> +{
> +	lockdep_assert_held(&delayed_refs->lock);
> +	lockdep_assert_held(&head->lock);
> +
> +	rb_erase(&head->href_node, &delayed_refs->href_root);
> +	RB_CLEAR_NODE(&head->href_node);
> +	atomic_dec(&delayed_refs->num_entries);
> +	delayed_refs->num_heads--;
> +	if (head->processing == 0)
> +		delayed_refs->num_heads_ready--;
> +}
> +
>  /*
>   * Helper to insert the ref_node to the tail or merge with tail.
>   *
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index ea1aecb6a50d..36318182e4ec 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -263,7 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
>  {
>  	mutex_unlock(&head->mutex);
>  }
> -
> +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
> +			   struct btrfs_delayed_ref_head *head);
>  
>  struct btrfs_delayed_ref_head *
>  btrfs_select_ref_head(struct btrfs_trans_handle *trans);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3d9fe58c0080..ccaccd78534e 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2577,12 +2577,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
>  		spin_unlock(&delayed_refs->lock);
>  		return 1;
>  	}
> -	delayed_refs->num_heads--;
> -	rb_erase(&head->href_node, &delayed_refs->href_root);
> -	RB_CLEAR_NODE(&head->href_node);
> -	spin_unlock(&head->lock);
> +	btrfs_delete_ref_head(delayed_refs, head);
>  	spin_unlock(&delayed_refs->lock);
> -	atomic_dec(&delayed_refs->num_entries);
> +	spin_unlock(&head->lock);
>  
>  	trace_run_delayed_ref_head(fs_info, head, 0);
>  
> @@ -7122,22 +7119,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
>  	if (!mutex_trylock(&head->mutex))
>  		goto out;
>  
> -	/*
> -	 * at this point we have a head with no other entries.  Go
> -	 * ahead and process it.
> -	 */
> -	rb_erase(&head->href_node, &delayed_refs->href_root);
> -	RB_CLEAR_NODE(&head->href_node);
> -	atomic_dec(&delayed_refs->num_entries);
> -
> -	/*
> -	 * we don't take a ref on the node because we're removing it from the
> -	 * tree, so we just steal the ref the tree was holding.
> -	 */
> -	delayed_refs->num_heads--;
> -	if (head->processing == 0)
> -		delayed_refs->num_heads_ready--;

In cleanup_ref_head we don't have the num_heads_ready-- code, so this is
not pure consolidation but changes the behavior to a certain extent. It
seems this patch is also fixing a bug w.r.t. num_heads_ready counts; if
so, this needs to be documented in the changelog.

> +	btrfs_delete_ref_head(delayed_refs, head);
>  	head->processing = 0;
> +
>  	spin_unlock(&head->lock);
>  	spin_unlock(&delayed_refs->lock);
>  
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-20 13:11 ` Nikolay Borisov
@ 2018-07-20 14:18   ` Josef Bacik
  0 siblings, 0 replies; 44+ messages in thread
From: Josef Bacik @ 2018-07-20 14:18 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: Josef Bacik, linux-btrfs, kernel-team, Josef Bacik

On Fri, Jul 20, 2018 at 04:11:29PM +0300, Nikolay Borisov wrote:
> 
> 
> On 19.07.2018 17:49, Josef Bacik wrote:
> > From: Josef Bacik <jbacik@fb.com>
> > 
> > We do this dance in cleanup_ref_head and check_ref_cleanup, unify it
> > into a helper and cleanup the calling functions.
> > 
> > Signed-off-by: Josef Bacik <jbacik@fb.com>
> > ---
> >  fs/btrfs/delayed-ref.c | 14 ++++++++++++++
> >  fs/btrfs/delayed-ref.h |  3 ++-
> >  fs/btrfs/extent-tree.c | 24 ++++--------------------
> >  3 files changed, 20 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> > index 03dec673d12a..e1b322d651dd 100644
> > --- a/fs/btrfs/delayed-ref.c
> > +++ b/fs/btrfs/delayed-ref.c
> > @@ -393,6 +393,20 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
> >  	return head;
> >  }
> >  
> > +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
> > +			   struct btrfs_delayed_ref_head *head)
> > +{
> > +	lockdep_assert_held(&delayed_refs->lock);
> > +	lockdep_assert_held(&head->lock);
> > +
> > +	rb_erase(&head->href_node, &delayed_refs->href_root);
> > +	RB_CLEAR_NODE(&head->href_node);
> > +	atomic_dec(&delayed_refs->num_entries);
> > +	delayed_refs->num_heads--;
> > +	if (head->processing == 0)
> > +		delayed_refs->num_heads_ready--;
> > +}
> > +
> >  /*
> >   * Helper to insert the ref_node to the tail or merge with tail.
> >   *
> > diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> > index ea1aecb6a50d..36318182e4ec 100644
> > --- a/fs/btrfs/delayed-ref.h
> > +++ b/fs/btrfs/delayed-ref.h
> > @@ -263,7 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
> >  {
> >  	mutex_unlock(&head->mutex);
> >  }
> > -
> > +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
> > +			   struct btrfs_delayed_ref_head *head);
> >  
> >  struct btrfs_delayed_ref_head *
> >  btrfs_select_ref_head(struct btrfs_trans_handle *trans);
> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index 3d9fe58c0080..ccaccd78534e 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -2577,12 +2577,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
> >  		spin_unlock(&delayed_refs->lock);
> >  		return 1;
> >  	}
> > -	delayed_refs->num_heads--;
> > -	rb_erase(&head->href_node, &delayed_refs->href_root);
> > -	RB_CLEAR_NODE(&head->href_node);
> > -	spin_unlock(&head->lock);
> > +	btrfs_delete_ref_head(delayed_refs, head);
> >  	spin_unlock(&delayed_refs->lock);
> > -	atomic_dec(&delayed_refs->num_entries);
> > +	spin_unlock(&head->lock);
> >  
> >  	trace_run_delayed_ref_head(fs_info, head, 0);
> >  
> > @@ -7122,22 +7119,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
> >  	if (!mutex_trylock(&head->mutex))
> >  		goto out;
> >  
> > -	/*
> > -	 * at this point we have a head with no other entries.  Go
> > -	 * ahead and process it.
> > -	 */
> > -	rb_erase(&head->href_node, &delayed_refs->href_root);
> > -	RB_CLEAR_NODE(&head->href_node);
> > -	atomic_dec(&delayed_refs->num_entries);
> > -
> > -	/*
> > -	 * we don't take a ref on the node because we're removing it from the
> > -	 * tree, so we just steal the ref the tree was holding.
> > -	 */
> > -	delayed_refs->num_heads--;
> > -	if (head->processing == 0)
> > -		delayed_refs->num_heads_ready--;
> 
> In cleanup_ref_head we don't have the num_heads_ready-- code, so this is
> not pure consolidation but changes the behavior to a certain extent. It
> seems this patch is also fixing a bug w.r.t. num_heads_ready counts; if
> so, this needs to be documented in the changelog.
> 

No it's not, because cleanup_ref_head is called when running delayed refs, so
head->processing == 1, which means there's no change.  Thanks,

Josef
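
In other words, spelled out against the new helper (a sketch of the
reasoning only, not additional code for the patch):

	/*
	 * On the __btrfs_run_delayed_refs() -> cleanup_ref_head() path the
	 * head was already selected for processing, so head->processing == 1
	 * and the conditional below in btrfs_delete_ref_head() does nothing:
	 */
	if (head->processing == 0)
		delayed_refs->num_heads_ready--;
	/*
	 * The decrement therefore only happens on the check_ref_cleanup()
	 * path, which matches the old behavior.
	 */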

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (22 preceding siblings ...)
  2018-07-20 13:11 ` Nikolay Borisov
@ 2018-07-20 14:48 ` David Sterba
  2018-07-20 16:04 ` David Sterba
  24 siblings, 0 replies; 44+ messages in thread
From: David Sterba @ 2018-07-20 14:48 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team, Josef Bacik

On Thu, Jul 19, 2018 at 10:49:45AM -0400, Josef Bacik wrote:
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2577,12 +2577,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
>  		spin_unlock(&delayed_refs->lock);
>  		return 1;
>  	}
> -	delayed_refs->num_heads--;
> -	rb_erase(&head->href_node, &delayed_refs->href_root);
> -	RB_CLEAR_NODE(&head->href_node);
> -	spin_unlock(&head->lock);
> +	btrfs_delete_ref_head(delayed_refs, head);
>  	spin_unlock(&delayed_refs->lock);
> -	atomic_dec(&delayed_refs->num_entries);
> +	spin_unlock(&head->lock);

The order of the unlocks is reversed; the head lock is nested inside the
delayed refs lock.

> @@ -7122,22 +7119,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
>  	if (!mutex_trylock(&head->mutex))
>  		goto out;
>  
> -	/*
> -	 * at this point we have a head with no other entries.  Go
> -	 * ahead and process it.
> -	 */
> -	rb_erase(&head->href_node, &delayed_refs->href_root);
> -	RB_CLEAR_NODE(&head->href_node);
> -	atomic_dec(&delayed_refs->num_entries);
> -
> -	/*
> -	 * we don't take a ref on the node because we're removing it from the
> -	 * tree, so we just steal the ref the tree was holding.
> -	 */
> -	delayed_refs->num_heads--;
> -	if (head->processing == 0)
> -		delayed_refs->num_heads_ready--;
> +	btrfs_delete_ref_head(delayed_refs, head);
>  	head->processing = 0;
> +
>  	spin_unlock(&head->lock);
>  	spin_unlock(&delayed_refs->lock);

This is the right order, e.g. here and everywhere else.
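
Concretely, keeping the existing nesting in the first hunk would look
something like this (a sketch of the ordering only, not a tested change):

	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);		/* inner lock released first */
	spin_unlock(&delayed_refs->lock);	/* then the outer delayed refs lock */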

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper
  2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
                   ` (23 preceding siblings ...)
  2018-07-20 14:48 ` David Sterba
@ 2018-07-20 16:04 ` David Sterba
  24 siblings, 0 replies; 44+ messages in thread
From: David Sterba @ 2018-07-20 16:04 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team, Josef Bacik

fstests run in sequence starting from btrfs/001, in qemu with 2G memory;
warning and crash at btrfs/124

btrfs/124		[22:58:39]
[10100.765898] run fstests btrfs/124 at 2018-07-19 22:58:39
...
[10110.113787] BTRFS: device fsid 8b6b700e-3346-4260-bb2e-c561ba4b9960 devid 1 transid 7 /dev/vdb
[10110.118673] BTRFS info (device vdb): allowing degraded mounts
[10110.120386] BTRFS info (device vdb): disk space caching is enabled
[10110.122070] BTRFS info (device vdb): has skinny extents
[10110.138082] BTRFS warning (device vdb): devid 2 uuid 1e933775-ded8-40fa-9b6a-6127afd4cc01 is missing
[10110.141411] BTRFS warning (device vdb): devid 2 uuid 1e933775-ded8-40fa-9b6a-6127afd4cc01 is missing
[10110.719342] WARNING: CPU: 3 PID: 28973 at fs/btrfs/extent-tree.c:6122 btrfs_trans_release_chunk_metadata+0x3f/0x50 [btrfs]
[10110.723171] Modules linked in: btrfs libcrc32c xor zstd_decompress zstd_compress xxhash raid6_pq dm_mod dax loop [last unloaded: libcrc32c]
[10110.727313] CPU: 3 PID: 28973 Comm: dd Not tainted 4.18.0-rc5-default+ #198
[10110.728504] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
[10110.730236] RIP: 0010:btrfs_trans_release_chunk_metadata+0x3f/0x50 [btrfs]
[10110.734444] RSP: 0018:ffffb3edc08bfba0 EFLAGS: 00010202
[10110.735361] RAX: ffff9b68e5232dd8 RBX: ffff9b69315f98f0 RCX: 00000000000c0000
[10110.736539] RDX: ffff9b69315f9948 RSI: ffff9b68e52369d8 RDI: ffff9b69315f98f0
[10110.737721] RBP: ffff9b68e5236000 R08: 0000000000000001 R09: 0000000000000000
[10110.738849] R10: ffff9b691e93e4e0 R11: 0000000000000000 R12: ffff9b69359b0000
[10110.739966] R13: 0000000000000000 R14: 0000000000000201 R15: ffff9b6917e08528
[10110.741460] FS:  00007f15f5b14540(0000) GS:ffff9b693cc00000(0000) knlGS:0000000000000000
[10110.742795] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[10110.743679] CR2: 00007f15f5571690 CR3: 000000002d8a0000 CR4: 00000000000006e0
[10110.744692] Call Trace:
[10110.745096]  __btrfs_end_transaction+0x55/0x230 [btrfs]
[10110.745762]  btrfs_create+0xf2/0x1f0 [btrfs]
[10110.746564]  lookup_open+0x562/0x700
[10110.747216]  ? __wake_up_common_lock+0x63/0xc0
[10110.748021]  ? find_held_lock+0x34/0xa0
[10110.748737]  path_openat+0x653/0xf90
[10110.749411]  do_filp_open+0x93/0x100
[10110.750076]  ? _raw_spin_unlock+0x24/0x40
[10110.750793]  ? __alloc_fd+0xee/0x1d0
[10110.751438]  do_sys_open+0x186/0x210
[10110.752152]  do_syscall_64+0x5a/0x170
[10110.752774]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[10110.753582] RIP: 0033:0x7f15f562090e
[10110.757116] RSP: 002b:00007ffea876ae20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101
[10110.758424] RAX: ffffffffffffffda RBX: 000055d6f74323e0 RCX: 00007f15f562090e
[10110.759523] RDX: 0000000000000241 RSI: 00007ffea876c286 RDI: 00000000ffffff9c
[10110.760501] RBP: 0000000000000001 R08: 00007ffea876c2a7 R09: 0000000000000000
[10110.761323] R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000241
[10110.762400] R13: 00007ffea876c286 R14: 0000000000000001 R15: 00007ffea876b118
[10110.763533] irq event stamp: 2212
[10110.764139] hardirqs last  enabled at (2211): [<ffffffff9221b007>] slab_free_freelist_hook+0x97/0xe0
[10110.765566] hardirqs last disabled at (2212): [<ffffffff92800f9c>] error_entry+0x6c/0xc0
[10110.766932] softirqs last  enabled at (2006): [<ffffffff921c485e>] __set_page_dirty_nobuffers+0xfe/0x140
[10110.768464] softirqs last disabled at (2002): [<ffffffff921ddebe>] wb_wakeup_delayed+0x2e/0x70
[10110.769891] ---[ end trace 6fb571b51e7cab5c ]---
[10131.610680] assertion failed: list_empty(&block_group->bg_list), file: fs/btrfs/extent-tree.c, line: 10070
[10131.614113] ------------[ cut here ]------------
[10131.615675] kernel BUG at fs/btrfs/ctree.h:3472!
[10131.617376] invalid opcode: 0000 [#1] PREEMPT SMP
[10131.618968] CPU: 1 PID: 28975 Comm: umount Tainted: G        W         4.18.0-rc5-default+ #198
[10131.621170] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
[10131.622672] RIP: 0010:assfail.constprop.84+0x18/0x1a [btrfs]
[10131.626514] RSP: 0018:ffffb3edc6bfbde0 EFLAGS: 00010246
[10131.627380] RAX: 000000000000005e RBX: ffff9b68e5232c00 RCX: 0000000000000006
[10131.628521] RDX: 0000000000000000 RSI: ffff9b68f9bcd9c8 RDI: 0000000000000246
[10131.629596] RBP: ffff9b69359b0000 R08: 00001016d930fc78 R09: 0000000000000000
[10131.630678] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9b69359b0118
[10131.631754] R13: ffff9b69359b0168 R14: ffff9b68e5232d98 R15: dead000000000100
[10131.632823] FS:  00007fbc00b0dfc0(0000) GS:ffff9b693c800000(0000) knlGS:0000000000000000
[10131.634119] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[10131.635030] CR2: 00007fbc006e167e CR3: 0000000053e5c000 CR4: 00000000000006e0
[10131.636136] Call Trace:
[10131.636641]  btrfs_free_block_groups.cold.112+0x22/0x55 [btrfs]
[10131.637583]  close_ctree+0x159/0x330 [btrfs]
[10131.638292]  generic_shutdown_super+0x64/0x100
[10131.639031]  kill_anon_super+0xe/0x20
[10131.639663]  btrfs_kill_super+0x12/0xa0 [btrfs]
[10131.640408]  deactivate_locked_super+0x29/0x60
[10131.641159]  cleanup_mnt+0x3b/0x70
[10131.641764]  task_work_run+0x9b/0xd0
[10131.642406]  exit_to_usermode_loop+0xbd/0xc0
[10131.643124]  do_syscall_64+0x16c/0x170
[10131.643775]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[10131.644590] RIP: 0033:0x7fbc003d4d07
[10131.648070] RSP: 002b:00007fff6a865598 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
[10131.649331] RAX: 0000000000000000 RBX: 0000557a82414970 RCX: 00007fbc003d4d07
[10131.650417] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000557a82414b50
[10131.651486] RBP: 0000000000000000 R08: 0000557a82414b70 R09: 00007fff6a863e00
[10131.652439] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fbc008f11c4
[10131.653253] R13: 0000557a82414b50 R14: 0000000000000000 R15: 00007fff6a865808
[10131.654063] Modules linked in: btrfs libcrc32c xor zstd_decompress zstd_compress xxhash raid6_pq dm_mod dax loop [last unloaded: libcrc32c]
[10131.655964] ---[ end trace 6fb571b51e7cab5d ]---
[10131.656787] RIP: 0010:assfail.constprop.84+0x18/0x1a [btrfs]
[10131.660454] RSP: 0018:ffffb3edc6bfbde0 EFLAGS: 00010246
[10131.661302] RAX: 000000000000005e RBX: ffff9b68e5232c00 RCX: 0000000000000006
[10131.662353] RDX: 0000000000000000 RSI: ffff9b68f9bcd9c8 RDI: 0000000000000246
[10131.663413] RBP: ffff9b69359b0000 R08: 00001016d930fc78 R09: 0000000000000000
[10131.664508] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9b69359b0118
[10131.665627] R13: ffff9b69359b0168 R14: ffff9b68e5232d98 R15: dead000000000100
[10131.666723] FS:  00007fbc00b0dfc0(0000) GS:ffff9b693c800000(0000) knlGS:0000000000000000
[10131.668029] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[10131.668943] CR2: 00007fbc006e167e CR3: 0000000053e5c000 CR4: 00000000000006e0
[10132.375092] BTRFS: device fsid fec9b6db-bc92-4981-85fa-f965b2e44059 devid 1 transid 234 /dev/vda

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 04/22] btrfs: only track ref_heads in delayed_ref_updates
  2018-07-19 14:49 ` [PATCH 04/22] btrfs: only track ref_heads in delayed_ref_updates Josef Bacik
@ 2018-07-21  8:49   ` Nikolay Borisov
  0 siblings, 0 replies; 44+ messages in thread
From: Nikolay Borisov @ 2018-07-21  8:49 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs, kernel-team; +Cc: Josef Bacik



On 19.07.2018 17:49, Josef Bacik wrote:
> From: Josef Bacik <jbacik@fb.com>
> 
> We use this number to figure out how many delayed refs to run, but
> __btrfs_run_delayed_refs really only checks it every time we need a new
> delayed ref head, so we always run at least one ref head completely,
> regardless of the number of items on it.  So instead track only the ref
> heads added by this trans handle and adjust the counting appropriately
> in __btrfs_run_delayed_refs.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/delayed-ref.c | 3 ---
>  fs/btrfs/extent-tree.c | 5 +----
>  2 files changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index e1b322d651dd..e5a8fb7ba9f6 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -234,8 +234,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
>  	ref->in_tree = 0;
>  	btrfs_put_delayed_ref(ref);
>  	atomic_dec(&delayed_refs->num_entries);
> -	if (trans->delayed_ref_updates)
> -		trans->delayed_ref_updates--;
>  }
>  
>  static bool merge_ref(struct btrfs_trans_handle *trans,
> @@ -460,7 +458,6 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
>  	if (ref->action == BTRFS_ADD_DELAYED_REF)
>  		list_add_tail(&ref->add_list, &href->ref_add_list);
>  	atomic_inc(&root->num_entries);
> -	trans->delayed_ref_updates++;
>  	spin_unlock(&href->lock);
>  	return ret;
>  }

This seems unrelated to what the changelog is talking about.

> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 33cd43dd73ee..168312ead4ae 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2667,6 +2667,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
>  				spin_unlock(&delayed_refs->lock);
>  				break;
>  			}
> +			count++;
>  
>  			/* grab the lock that says we are going to process
>  			 * all the refs for this head */
> @@ -2680,7 +2681,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
>  			 */
>  			if (ret == -EAGAIN) {
>  				locked_ref = NULL;
> -				count++;
>  				continue;
>  			}
>  		}
> @@ -2708,7 +2708,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
>  			unselect_delayed_ref_head(delayed_refs, locked_ref);
>  			locked_ref = NULL;
>  			cond_resched();
> -			count++;
>  			continue;
>  		}
>  
> @@ -2726,7 +2725,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
>  				return ret;
>  			}
>  			locked_ref = NULL;
> -			count++;
>  			continue;
>  		}
>  
> @@ -2777,7 +2775,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
>  		}
>  
>  		btrfs_put_delayed_ref(ref);
> -		count++;
>  		cond_resched();
>  	}
>  
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc
  2018-07-19 15:43   ` Nikolay Borisov
@ 2018-07-24 11:24     ` David Sterba
  0 siblings, 0 replies; 44+ messages in thread
From: David Sterba @ 2018-07-24 11:24 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: Josef Bacik, linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 06:43:39PM +0300, Nikolay Borisov wrote:
> 
> 
> On 19.07.2018 17:49, Josef Bacik wrote:
> > If we're trying to make a data reservation and we have to allocate a
> > data chunk we could leak ret == 1, as do_chunk_alloc() will return 1 if
> > it allocated a chunk.  Since the end of the function is the success path
> > just return 0.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> 
> Reviewed-by: Nikolay Borisov <nborisov@suse.com>
> 
> The logic flow in this function is a steaming pile of turd and is in
> dire need of refactoring...

Agreed, fixing the default return code seems like the right fix now,
compared to untangling the gotos. Patch added to misc-next.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 15/22] btrfs: run delayed iputs before committing
  2018-07-19 14:49 ` [PATCH 15/22] btrfs: run delayed iputs before committing Josef Bacik
@ 2018-08-21 13:51   ` David Sterba
  0 siblings, 0 replies; 44+ messages in thread
From: David Sterba @ 2018-08-21 13:51 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Thu, Jul 19, 2018 at 10:49:59AM -0400, Josef Bacik wrote:
> We want to have a complete picture of any delayed inode updates before
> we make the decision to commit or not, so make sure we run delayed iputs
> before making the decision to commit or not.

This basically says "we need to do X, so we do X", lacking the
explanation why.

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2018-08-21 17:11 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-19 14:49 [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper Josef Bacik
2018-07-19 14:49 ` [PATCH 02/22] btrfs: add cleanup_ref_head_accounting helper Josef Bacik
2018-07-19 14:49 ` [PATCH 03/22] btrfs: use cleanup_extent_op in check_ref_cleanup Josef Bacik
2018-07-19 14:49 ` [PATCH 04/22] btrfs: only track ref_heads in delayed_ref_updates Josef Bacik
2018-07-21  8:49   ` Nikolay Borisov
2018-07-19 14:49 ` [PATCH 05/22] btrfs: introduce delayed_refs_rsv Josef Bacik
2018-07-19 14:49 ` [PATCH 06/22] btrfs: check if free bgs for commit Josef Bacik
2018-07-19 14:49 ` [PATCH 07/22] btrfs: don't leak ret from do_chunk_alloc Josef Bacik
2018-07-19 15:43   ` Nikolay Borisov
2018-07-24 11:24     ` David Sterba
2018-07-19 14:49 ` [PATCH 08/22] btrfs: dump block_rsv when dumping space info Josef Bacik
2018-07-19 15:39   ` Nikolay Borisov
2018-07-20 11:59   ` David Sterba
2018-07-19 14:49 ` [PATCH 09/22] btrfs: release metadata before running delayed refs Josef Bacik
2018-07-19 14:49 ` [PATCH 10/22] btrfs: alloc space cache inode with GFP_NOFS Josef Bacik
2018-07-19 15:35   ` Nikolay Borisov
2018-07-19 15:44     ` David Sterba
2018-07-19 15:56       ` Josef Bacik
2018-07-19 15:55     ` Josef Bacik
2018-07-19 14:49 ` [PATCH 11/22] btrfs: fix truncate throttling Josef Bacik
2018-07-19 14:49 ` [PATCH 12/22] btrfs: don't use global rsv for chunk allocation Josef Bacik
2018-07-19 14:49 ` [PATCH 13/22] btrfs: reset max_extent_size properly Josef Bacik
2018-07-19 14:49 ` [PATCH 14/22] btrfs: don't enospc all tickets on flush failure Josef Bacik
2018-07-19 14:49 ` [PATCH 15/22] btrfs: run delayed iputs before committing Josef Bacik
2018-08-21 13:51   ` David Sterba
2018-07-19 14:50 ` [PATCH 16/22] btrfs: loop in inode_rsv_refill Josef Bacik
2018-07-19 14:50 ` [PATCH 17/22] btrfs: don't take the dio_sem in the fsync path Josef Bacik
2018-07-19 15:12   ` Filipe Manana
2018-07-19 15:21   ` Filipe Manana
2018-07-19 15:54     ` Josef Bacik
2018-07-19 15:57       ` Filipe Manana
2018-07-19 14:50 ` [PATCH 18/22] btrfs: add ALLOC_CHUNK_FORCE to the flushing code Josef Bacik
2018-07-19 14:50 ` [PATCH 19/22] btrfs: set max_extent_size properly Josef Bacik
2018-07-19 14:50 ` [PATCH 20/22] btrfs: don't use ctl->free_space for max_extent_size Josef Bacik
2018-07-19 14:50 ` [PATCH 21/22] btrfs: reset max_extent_size on clear in a bitmap Josef Bacik
2018-07-19 14:50 ` [PATCH 22/22] btrfs: only run delayed refs if we're committing Josef Bacik
2018-07-20  7:37   ` Nikolay Borisov
2018-07-19 16:08 ` [PATCH 01/22] btrfs: add btrfs_delete_ref_head helper David Sterba
2018-07-19 16:12   ` Josef Bacik
2018-07-19 16:36     ` David Sterba
2018-07-20 13:11 ` Nikolay Borisov
2018-07-20 14:18   ` Josef Bacik
2018-07-20 14:48 ` David Sterba
2018-07-20 16:04 ` David Sterba
