From: Josef Bacik <josef@toxicpanda.com>
To: Dennis Zhou <dennis@kernel.org>
Cc: David Sterba <dsterba@suse.com>, Chris Mason <clm@fb.com>,
Josef Bacik <josef@toxicpanda.com>,
Omar Sandoval <osandov@osandov.com>,
kernel-team@fb.com, linux-btrfs@vger.kernel.org
Subject: Re: [PATCH 07/19] btrfs: discard one region at a time in async discard
Date: Thu, 10 Oct 2019 11:22:44 -0400 [thread overview]
Message-ID: <20191010152243.guevccs7bacoop2m@macbook-pro-91.dhcp.thefacebook.com> (raw)
In-Reply-To: <79b26926486575eb53825c1aad607e99028d8447.1570479299.git.dennis@kernel.org>
On Mon, Oct 07, 2019 at 04:17:38PM -0400, Dennis Zhou wrote:
> The prior two patches added discarding via a background workqueue. This
> just piggybacked off of the fstrim code to trim the whole block at once.
> Well inevitably this is worse performance wise and will aggressively
> overtrim. But it was nice to plumb the other infrastructure to keep the
> patches easier to review.
>
> This adds the real goal of this series which is discarding slowly (ie a
> slow long running fstrim). The discarding is split into two phases,
> extents and then bitmaps. The reason for this is two fold. First, the
> bitmap regions overlap the extent regions. Second, discarding the
> extents first will let the newly trimmed bitmaps have the highest chance
> of coalescing when being readded to the free space cache.
>
> Signed-off-by: Dennis Zhou <dennis@kernel.org>
> ---
> fs/btrfs/block-group.h | 2 +
> fs/btrfs/discard.c | 73 ++++++++++++++++++++-----
> fs/btrfs/discard.h | 16 ++++++
> fs/btrfs/extent-tree.c | 3 +-
> fs/btrfs/free-space-cache.c | 106 ++++++++++++++++++++++++++----------
> fs/btrfs/free-space-cache.h | 6 +-
> 6 files changed, 159 insertions(+), 47 deletions(-)
>
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index 0f9a1c91753f..b59e6a8ed73d 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -120,6 +120,8 @@ struct btrfs_block_group_cache {
> struct list_head discard_list;
> int discard_index;
> u64 discard_delay;
> + u64 discard_cursor;
> + u32 discard_flags;
>
Same comment as the free space flags, this is just a state holder and never has
more than one bit set, so switch it to an enum and treat it like a state.
> /* For dirty block groups */
> struct list_head dirty_list;
> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> index fb92b888774d..26a1e44b4bfa 100644
> --- a/fs/btrfs/discard.c
> +++ b/fs/btrfs/discard.c
> @@ -22,21 +22,28 @@ btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl,
> return &discard_ctl->discard_list[cache->discard_index];
> }
>
> -void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> - struct btrfs_block_group_cache *cache)
> +static void __btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> + struct btrfs_block_group_cache *cache)
> {
> u64 now = ktime_get_ns();
>
> - spin_lock(&discard_ctl->lock);
> -
> if (list_empty(&cache->discard_list) || !cache->discard_index) {
> if (!cache->discard_index)
> cache->discard_index = 1;
> cache->discard_delay = now + BTRFS_DISCARD_DELAY;
> + cache->discard_flags |= BTRFS_DISCARD_RESET_CURSOR;
> }
>
> list_move_tail(&cache->discard_list,
> btrfs_get_discard_list(discard_ctl, cache));
> +}
> +
> +void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> + struct btrfs_block_group_cache *cache)
> +{
> + spin_lock(&discard_ctl->lock);
> +
> + __btrfs_add_to_discard_list(discard_ctl, cache);
>
> spin_unlock(&discard_ctl->lock);
> }
> @@ -53,6 +60,7 @@ void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
>
> cache->discard_index = 0;
> cache->discard_delay = now;
> + cache->discard_flags |= BTRFS_DISCARD_RESET_CURSOR;
> list_add_tail(&cache->discard_list, &discard_ctl->discard_list[0]);
>
> spin_unlock(&discard_ctl->lock);
> @@ -114,13 +122,24 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
>
> spin_lock(&discard_ctl->lock);
>
> +again:
> cache = find_next_cache(discard_ctl, now);
>
> - if (cache && now < cache->discard_delay)
> + if (cache && now > cache->discard_delay) {
> + discard_ctl->cache = cache;
> + if (cache->discard_index == 0 &&
> + cache->free_space_ctl->free_space != cache->key.offset) {
> + __btrfs_add_to_discard_list(discard_ctl, cache);
> + goto again;
The magic number thing again, it needs to be discard_index == UNUSED_DISCARD or
some such descriptive thing. Also needs to be btrfs_block_group_used(cache) ==
0;
> + }
> + if (btrfs_discard_reset_cursor(cache)) {
> + cache->discard_cursor = cache->key.objectid;
> + cache->discard_flags &= ~(BTRFS_DISCARD_RESET_CURSOR |
> + BTRFS_DISCARD_BITMAPS);
> + }
> + } else {
> cache = NULL;
> -
> - discard_ctl->cache = cache;
> -
> + }
> spin_unlock(&discard_ctl->lock);
>
> return cache;
> @@ -173,18 +192,42 @@ static void btrfs_discard_workfn(struct work_struct *work)
>
> discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
>
> +again:
> cache = peek_discard_list(discard_ctl);
> if (!cache || !btrfs_run_discard_work(discard_ctl))
> return;
>
> - btrfs_trim_block_group(cache, &trimmed, cache->key.objectid,
> - btrfs_block_group_end(cache), 0);
> + if (btrfs_discard_bitmaps(cache))
> + btrfs_trim_block_group_bitmaps(cache, &trimmed,
> + cache->discard_cursor,
> + btrfs_block_group_end(cache),
> + 0, true);
> + else
> + btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor,
> + btrfs_block_group_end(cache), 0, true);
> +
> + if (cache->discard_cursor >= btrfs_block_group_end(cache)) {
> + if (btrfs_discard_bitmaps(cache)) {
> + remove_from_discard_list(discard_ctl, cache);
> + if (btrfs_is_free_space_trimmed(cache))
> + btrfs_mark_bg_unused(cache);
> + else if (cache->free_space_ctl->free_space ==
> + cache->key.offset)
btrfs_block_group_used(cache) == 0;
> + btrfs_add_to_discard_free_list(discard_ctl,
> + cache);
> + } else {
> + cache->discard_cursor = cache->key.objectid;
> + cache->discard_flags |= BTRFS_DISCARD_BITMAPS;
> + }
> + }
> +
> + spin_lock(&discard_ctl->lock);
> + discard_ctl->cache = NULL;
> + spin_unlock(&discard_ctl->lock);
>
> - remove_from_discard_list(discard_ctl, cache);
> - if (btrfs_is_free_space_trimmed(cache))
> - btrfs_mark_bg_unused(cache);
> - else if (cache->free_space_ctl->free_space == cache->key.offset)
> - btrfs_add_to_discard_free_list(discard_ctl, cache);
> + /* we didn't trim anything but we really ought to so try again */
> + if (trimmed == 0)
> + goto again;
Why? We'll reschedule if we need to. We unconditionally do this, I feel like
there's going to be some corner case where we end up seeing this workfn using
100% on a bunch of sandcastle boxes and we'll all be super sad.
>
> btrfs_discard_schedule_work(discard_ctl, false);
> }
> diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
> index 55f79b624943..22cfa7e401bb 100644
> --- a/fs/btrfs/discard.h
> +++ b/fs/btrfs/discard.h
> @@ -13,6 +13,22 @@
> #include "block-group.h"
> #include "free-space-cache.h"
>
> +/* discard flags */
> +#define BTRFS_DISCARD_RESET_CURSOR (1UL << 0)
> +#define BTRFS_DISCARD_BITMAPS (1UL << 1)
> +
> +static inline
> +bool btrfs_discard_reset_cursor(struct btrfs_block_group_cache *cache)
> +{
> + return (cache->discard_flags & BTRFS_DISCARD_RESET_CURSOR);
> +}
> +
> +static inline
> +bool btrfs_discard_bitmaps(struct btrfs_block_group_cache *cache)
> +{
> + return (cache->discard_flags & BTRFS_DISCARD_BITMAPS);
> +}
> +
> void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> struct btrfs_block_group_cache *cache);
> void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d69ee5f51b38..ff42e4abb01d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5683,7 +5683,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
> &group_trimmed,
> start,
> end,
> - range->minlen);
> + range->minlen,
> + false);
>
> trimmed += group_trimmed;
> if (ret) {
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index ed0e7ee4c78d..97b3074e83c0 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -3267,7 +3267,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
> }
>
> static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> - u64 *total_trimmed, u64 start, u64 end, u64 minlen)
> + u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> + bool async)
> {
> struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> struct btrfs_free_space *entry;
> @@ -3284,36 +3285,25 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> mutex_lock(&ctl->cache_writeout_mutex);
> spin_lock(&ctl->tree_lock);
>
> - if (ctl->free_space < minlen) {
> - spin_unlock(&ctl->tree_lock);
> - mutex_unlock(&ctl->cache_writeout_mutex);
> - break;
> - }
> + if (ctl->free_space < minlen)
> + goto out_unlock;
>
> entry = tree_search_offset(ctl, start, 0, 1);
> - if (!entry) {
> - spin_unlock(&ctl->tree_lock);
> - mutex_unlock(&ctl->cache_writeout_mutex);
> - break;
> - }
> + if (!entry)
> + goto out_unlock;
>
> /* skip bitmaps */
> - while (entry->bitmap) {
> + while (entry->bitmap || (async &&
> + btrfs_free_space_trimmed(entry))) {
Update the comment to say we're skipping already trimmed entries as well please.
> node = rb_next(&entry->offset_index);
> - if (!node) {
> - spin_unlock(&ctl->tree_lock);
> - mutex_unlock(&ctl->cache_writeout_mutex);
> - goto out;
> - }
> + if (!node)
> + goto out_unlock;
> entry = rb_entry(node, struct btrfs_free_space,
> offset_index);
> }
>
> - if (entry->offset >= end) {
> - spin_unlock(&ctl->tree_lock);
> - mutex_unlock(&ctl->cache_writeout_mutex);
> - break;
> - }
> + if (entry->offset >= end)
> + goto out_unlock;
>
> extent_start = entry->offset;
> extent_bytes = entry->bytes;
> @@ -3338,10 +3328,15 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> ret = do_trimming(block_group, total_trimmed, start, bytes,
> extent_start, extent_bytes, extent_flags,
> &trim_entry);
> - if (ret)
> + if (ret) {
> + block_group->discard_cursor = start + bytes;
> break;
> + }
> next:
> start += bytes;
> + block_group->discard_cursor = start;
> + if (async && *total_trimmed)
> + break;
Alright so this means we'll only trim one entry and then return if we're async?
It seems to be the same below for bitmaps. This deserves a comment for the
functions, it fundamentally changes the behavior of the function if we're async.
>
> if (fatal_signal_pending(current)) {
> ret = -ERESTARTSYS;
> @@ -3350,7 +3345,14 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
>
> cond_resched();
> }
> -out:
> +
> + return ret;
> +
> +out_unlock:
> + block_group->discard_cursor = btrfs_block_group_end(block_group);
> + spin_unlock(&ctl->tree_lock);
> + mutex_unlock(&ctl->cache_writeout_mutex);
> +
> return ret;
> }
>
> @@ -3390,7 +3392,8 @@ static void end_trimming_bitmap(struct btrfs_free_space *entry)
> }
>
> static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> - u64 *total_trimmed, u64 start, u64 end, u64 minlen)
> + u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> + bool async)
> {
> struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> struct btrfs_free_space *entry;
> @@ -3407,13 +3410,16 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> spin_lock(&ctl->tree_lock);
>
> if (ctl->free_space < minlen) {
> + block_group->discard_cursor =
> + btrfs_block_group_end(block_group);
> spin_unlock(&ctl->tree_lock);
> mutex_unlock(&ctl->cache_writeout_mutex);
> break;
> }
>
> entry = tree_search_offset(ctl, offset, 1, 0);
> - if (!entry) {
> + if (!entry || (async && start == offset &&
> + btrfs_free_space_trimmed(entry))) {
> spin_unlock(&ctl->tree_lock);
> mutex_unlock(&ctl->cache_writeout_mutex);
> next_bitmap = true;
> @@ -3446,6 +3452,16 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> goto next;
> }
>
> + /*
> + * We already trimmed a region, but are using the locking above
> + * to reset the BTRFS_FSC_TRIMMING_BITMAP flag.
> + */
> + if (async && *total_trimmed) {
> + spin_unlock(&ctl->tree_lock);
> + mutex_unlock(&ctl->cache_writeout_mutex);
> + return ret;
> + }
> +
> bytes = min(bytes, end - start);
> if (bytes < minlen) {
> entry->flags &= ~BTRFS_FSC_TRIMMING_BITMAP;
> @@ -3468,6 +3484,8 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> start, bytes, 0, &trim_entry);
> if (ret) {
> reset_trimming_bitmap(ctl, offset);
> + block_group->discard_cursor =
> + btrfs_block_group_end(block_group);
> break;
> }
> next:
> @@ -3477,6 +3495,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> } else {
> start += bytes;
> }
> + block_group->discard_cursor = start;
>
> if (fatal_signal_pending(current)) {
> if (start != offset)
> @@ -3488,6 +3507,9 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> cond_resched();
> }
>
> + if (offset >= end)
> + block_group->discard_cursor = end;
> +
> return ret;
> }
>
> @@ -3532,7 +3554,8 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
> }
>
> int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> - u64 *trimmed, u64 start, u64 end, u64 minlen)
> + u64 *trimmed, u64 start, u64 end, u64 minlen,
> + bool async)
> {
> struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> int ret;
> @@ -3547,11 +3570,11 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> btrfs_get_block_group_trimming(block_group);
> spin_unlock(&block_group->lock);
>
> - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
> - if (ret)
> + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
> + if (ret || async)
> goto out;
>
You already separate out btrfs_trim_block_group_bitmaps, so this function really
only trims the whole block group if !async. Make a separate helper for trimming
only the extents so it's clear what the async stuff is doing, and we're not
relying on the async to change the behavior of btrfs_trim_block_group().
Thanks,
Josef
next prev parent reply other threads:[~2019-10-10 15:22 UTC|newest]
Thread overview: 71+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-10-07 20:17 [RFC PATCH 00/19] btrfs: async discard support Dennis Zhou
2019-10-07 20:17 ` [PATCH 01/19] bitmap: genericize percpu bitmap region iterators Dennis Zhou
2019-10-07 20:26 ` Josef Bacik
2019-10-07 22:24 ` Dennis Zhou
2019-10-15 12:11 ` David Sterba
2019-10-15 18:35 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 02/19] btrfs: rename DISCARD opt to DISCARD_SYNC Dennis Zhou
2019-10-07 20:27 ` Josef Bacik
2019-10-08 11:12 ` Johannes Thumshirn
2019-10-11 9:19 ` Nikolay Borisov
2019-10-07 20:17 ` [PATCH 03/19] btrfs: keep track of which extents have been discarded Dennis Zhou
2019-10-07 20:37 ` Josef Bacik
2019-10-07 22:38 ` Dennis Zhou
2019-10-10 13:40 ` Josef Bacik
2019-10-11 16:15 ` Dennis Zhou
2019-10-08 12:46 ` Nikolay Borisov
2019-10-11 16:08 ` Dennis Zhou
2019-10-15 12:17 ` David Sterba
2019-10-15 19:58 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 04/19] btrfs: keep track of cleanliness of the bitmap Dennis Zhou
2019-10-10 14:16 ` Josef Bacik
2019-10-11 16:17 ` Dennis Zhou
2019-10-15 12:23 ` David Sterba
2019-10-07 20:17 ` [PATCH 05/19] btrfs: add the beginning of async discard, discard workqueue Dennis Zhou
2019-10-10 14:38 ` Josef Bacik
2019-10-15 12:49 ` David Sterba
2019-10-15 19:57 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 06/19] btrfs: handle empty block_group removal Dennis Zhou
2019-10-10 15:00 ` Josef Bacik
2019-10-11 16:52 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 07/19] btrfs: discard one region at a time in async discard Dennis Zhou
2019-10-10 15:22 ` Josef Bacik [this message]
2019-10-14 19:42 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 08/19] btrfs: track discardable extents for asnyc discard Dennis Zhou
2019-10-10 15:36 ` Josef Bacik
2019-10-14 19:50 ` Dennis Zhou
2019-10-15 13:12 ` David Sterba
2019-10-15 18:41 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 09/19] btrfs: keep track of discardable_bytes Dennis Zhou
2019-10-10 15:38 ` Josef Bacik
2019-10-07 20:17 ` [PATCH 10/19] btrfs: calculate discard delay based on number of extents Dennis Zhou
2019-10-10 15:41 ` Josef Bacik
2019-10-11 18:07 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 11/19] btrfs: add bps discard rate limit Dennis Zhou
2019-10-10 15:47 ` Josef Bacik
2019-10-14 19:56 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 12/19] btrfs: limit max discard size for async discard Dennis Zhou
2019-10-10 16:16 ` Josef Bacik
2019-10-14 19:57 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 13/19] btrfs: have multiple discard lists Dennis Zhou
2019-10-10 16:51 ` Josef Bacik
2019-10-14 20:04 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 14/19] btrfs: only keep track of data extents for async discard Dennis Zhou
2019-10-10 16:53 ` Josef Bacik
2019-10-07 20:17 ` [PATCH 15/19] btrfs: load block_groups into discard_list on mount Dennis Zhou
2019-10-10 17:11 ` Josef Bacik
2019-10-14 20:17 ` Dennis Zhou
2019-10-14 23:38 ` David Sterba
2019-10-15 15:42 ` Dennis Zhou
2019-10-07 20:17 ` [PATCH 16/19] btrfs: keep track of discard reuse stats Dennis Zhou
2019-10-10 17:13 ` Josef Bacik
2019-10-07 20:17 ` [PATCH 17/19] btrfs: add async discard header Dennis Zhou
2019-10-10 17:13 ` Josef Bacik
2019-10-07 20:17 ` [PATCH 18/19] btrfs: increase the metadata allowance for the free_space_cache Dennis Zhou
2019-10-10 17:16 ` Josef Bacik
2019-10-07 20:17 ` [PATCH 19/19] btrfs: make smaller extents more likely to go into bitmaps Dennis Zhou
2019-10-10 17:17 ` Josef Bacik
2019-10-11 7:49 ` [RFC PATCH 00/19] btrfs: async discard support Nikolay Borisov
2019-10-14 21:05 ` Dennis Zhou
2019-10-15 12:08 ` David Sterba
2019-10-15 15:41 ` Dennis Zhou
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20191010152243.guevccs7bacoop2m@macbook-pro-91.dhcp.thefacebook.com \
--to=josef@toxicpanda.com \
--cc=clm@fb.com \
--cc=dennis@kernel.org \
--cc=dsterba@suse.com \
--cc=kernel-team@fb.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=osandov@osandov.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).