All of lore.kernel.org
 help / color / mirror / Atom feed
From: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH v2] btrfs: introduce priority based delalloc shrink mechanism
Date: Thu, 13 Oct 2016 17:31:25 +0800	[thread overview]
Message-ID: <1476351085-26821-1-git-send-email-wangxg.fnst@cn.fujitsu.com> (raw)
In-Reply-To: <57FF1E50.7000706@cn.fujitsu.com>

Since commit b02441999efcc6152b87cd58e7970bb7843f76cf, we don't wait all
ordered extents, but I run into some enospc errors when doing large file
create and delete tests, it's because shrink_delalloc() does not write
enough delalloc bytes and wait them finished:
    From: Miao Xie <miaox@cn.fujitsu.com>
    Date: Mon, 4 Nov 2013 23:13:25 +0800
    Subject: [PATCH] Btrfs: don't wait for the completion of all the ordered extents

    It is very likely that there are lots of ordered extents in the filesytem,
    if we wait for the completion of all of them when we want to reclaim some
    space for the metadata space reservation, we would be blocked for a long
    time. The performance would drop down suddenly for a long time.

Here we introduce a simple reclaim_priority variable, the lower the
value, the higher the priority, 0 is the maximum priority. The core
idea is:
    delalloc_bytes = percpu_counter_sum_positive(&root->fs_info->delalloc_bytes);
    if (reclaim_priority)
        to_reclaim = orig * (2 << (BTRFS_DEFAULT_FLUSH_PRIORITY - reclaim_priority));
    else
        to_reclaim = delalloc_bytes;

Here 'orig' is the number of metadata we want to reserve, and as the priority
increases, we will try wo write more delalloc bytes, meanwhile if
"reclaim_priority == 0" returns true, we'll also wait all current ordered
extents to finish.

Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c       | 63 ++++++++++++++++++++++++++------------------
 include/trace/events/btrfs.h | 11 +++-----
 2 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e08791d..7477c25 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4686,16 +4686,18 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
 }
 
 #define EXTENT_SIZE_PER_ITEM	SZ_256K
+#define	BTRFS_DEFAULT_FLUSH_PRIORITY	3
 
 /*
  * shrink metadata reservation for delalloc
  */
-static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
-			    bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 orig,
+			    bool wait_ordered, int reclaim_priority)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
+	u64 to_reclaim;
 	u64 delalloc_bytes;
 	u64 max_reclaim;
 	long time_left;
@@ -4703,22 +4705,36 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	int loops;
 	int items;
 	enum btrfs_reserve_flush_enum flush;
+	int items_to_wait;
+
+	delalloc_bytes = percpu_counter_sum_positive(
+				&root->fs_info->delalloc_bytes);
+	if (reclaim_priority < 0)
+		reclaim_priority = 0;
+
+	if (reclaim_priority)
+		to_reclaim = orig * (2 << (BTRFS_DEFAULT_FLUSH_PRIORITY -
+					reclaim_priority));
+	else
+		to_reclaim = delalloc_bytes;
 
 	/* Calc the number of the pages we need flush for space reservation */
 	items = calc_reclaim_items_nr(root, to_reclaim);
 	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
+	if (reclaim_priority)
+		items_to_wait = items;
+	else
+		items_to_wait = -1;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
-	delalloc_bytes = percpu_counter_sum_positive(
-						&root->fs_info->delalloc_bytes);
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
 		if (wait_ordered)
-			btrfs_wait_ordered_roots(root->fs_info, items,
+			btrfs_wait_ordered_roots(root->fs_info, items_to_wait,
 						 0, (u64)-1);
 		return;
 	}
@@ -4763,7 +4779,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_roots(root->fs_info, items,
+			btrfs_wait_ordered_roots(root->fs_info, items_to_wait,
 						 0, (u64)-1);
 		} else {
 			time_left = schedule_timeout_killable(1);
@@ -4836,7 +4852,7 @@ struct reserve_ticket {
 
 static int flush_space(struct btrfs_root *root,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
-		       u64 orig_bytes, int state)
+		       int state, int reclaim_priority)
 {
 	struct btrfs_trans_handle *trans;
 	int nr;
@@ -4860,8 +4876,8 @@ static int flush_space(struct btrfs_root *root,
 		break;
 	case FLUSH_DELALLOC:
 	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(root, num_bytes * 2, orig_bytes,
-				state == FLUSH_DELALLOC_WAIT);
+		shrink_delalloc(root, num_bytes, state == FLUSH_DELALLOC_WAIT,
+				reclaim_priority);
 		break;
 	case ALLOC_CHUNK:
 		trans = btrfs_join_transaction(root);
@@ -4877,7 +4893,7 @@ static int flush_space(struct btrfs_root *root,
 			ret = 0;
 		break;
 	case COMMIT_TRANS:
-		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+		ret = may_commit_transaction(root, space_info, num_bytes, 0);
 		break;
 	default:
 		ret = -ENOSPC;
@@ -4885,7 +4901,7 @@ static int flush_space(struct btrfs_root *root,
 	}
 
 	trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
-				orig_bytes, state, ret);
+				state, ret);
 	return ret;
 }
 
@@ -5008,8 +5024,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 	struct btrfs_space_info *space_info;
 	u64 to_reclaim;
 	int flush_state;
-	int commit_cycles = 0;
 	u64 last_tickets_id;
+	int reclaim_priority = BTRFS_DEFAULT_FLUSH_PRIORITY;
 
 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
@@ -5030,8 +5046,11 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		struct reserve_ticket *ticket;
 		int ret;
 
+		if (flush_state > COMMIT_TRANS)
+			flush_state = FLUSH_DELAYED_ITEMS_NR;
 		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
-			    to_reclaim, flush_state);
+				  flush_state, reclaim_priority);
+
 		spin_lock(&space_info->lock);
 		if (!ret)
 			try_to_wake_tickets(fs_info->fs_root, space_info);
@@ -5049,21 +5068,15 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		} else {
 			last_tickets_id = space_info->tickets_id;
 			flush_state = FLUSH_DELAYED_ITEMS_NR;
-			if (commit_cycles)
-				commit_cycles--;
+			reclaim_priority = BTRFS_DEFAULT_FLUSH_PRIORITY;
 		}
 
-		if (flush_state > COMMIT_TRANS) {
-			commit_cycles++;
-			if (commit_cycles > 2) {
-				wake_all_tickets(&space_info->tickets);
-				space_info->flush = 0;
-			} else {
-				flush_state = FLUSH_DELAYED_ITEMS_NR;
-			}
+		if (flush_state > COMMIT_TRANS && reclaim_priority == 0) {
+			wake_all_tickets(&space_info->tickets);
+			space_info->flush = 0;
 		}
 		spin_unlock(&space_info->lock);
-	} while (flush_state <= COMMIT_TRANS);
+	} while ((flush_state <= COMMIT_TRANS) || (--reclaim_priority >= 0));
 }
 
 void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -5089,7 +5102,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 
 	do {
 		flush_space(fs_info->fs_root, space_info, to_reclaim,
-			    to_reclaim, flush_state);
+			    flush_state, 1);
 		flush_state++;
 		spin_lock(&space_info->lock);
 		if (ticket->bytes == 0) {
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index e030d6f..7d953a6 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -857,15 +857,14 @@ TRACE_EVENT(btrfs_trigger_flush,
 TRACE_EVENT(btrfs_flush_space,
 
 	TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
-		 u64 orig_bytes, int state, int ret),
+		 int state, int ret),
 
-	TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret),
+	TP_ARGS(fs_info, flags, num_bytes, state, ret),
 
 	TP_STRUCT__entry(
 		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
 		__field(	u64,	flags			)
 		__field(	u64,	num_bytes		)
-		__field(	u64,	orig_bytes		)
 		__field(	int,	state			)
 		__field(	int,	ret			)
 	),
@@ -874,19 +873,17 @@ TRACE_EVENT(btrfs_flush_space,
 		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
 		__entry->flags		=	flags;
 		__entry->num_bytes	=	num_bytes;
-		__entry->orig_bytes	=	orig_bytes;
 		__entry->state		=	state;
 		__entry->ret		=	ret;
 	),
 
 	TP_printk("%pU: state = %d(%s), flags = %llu(%s), num_bytes = %llu, "
-		  "orig_bytes = %llu, ret = %d", __entry->fsid, __entry->state,
+		  "ret = %d", __entry->fsid, __entry->state,
 		  show_flush_state(__entry->state),
 		  (unsigned long long)__entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
 				BTRFS_GROUP_FLAGS),
-		  (unsigned long long)__entry->num_bytes,
-		  (unsigned long long)__entry->orig_bytes, __entry->ret)
+		  (unsigned long long)__entry->num_bytes, __entry->ret)
 );
 
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
-- 
2.7.4




  reply	other threads:[~2016-10-13  9:37 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-12  9:03 [PATCH 1/2] btrfs: introduce priority based delalloc shrink mechanism Wang Xiaoguang
2016-10-12  9:03 ` [PATCH 2/2] btrfs: try to satisfy metadata requests if wen can overcommit Wang Xiaoguang
2016-10-12 17:21   ` Josef Bacik
2016-10-12 17:20 ` [PATCH 1/2] btrfs: introduce priority based delalloc shrink mechanism Josef Bacik
2016-10-13  5:40   ` Wang Xiaoguang
2016-10-13  9:31     ` Wang Xiaoguang [this message]
2016-10-24 15:43       ` [PATCH v2] " David Sterba
2016-11-22 19:35         ` Chris Mason

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1476351085-26821-1-git-send-email-wangxg.fnst@cn.fujitsu.com \
    --to=wangxg.fnst@cn.fujitsu.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.