The delayed refs rsv patches exposed a bunch of issues in our enospc
infrastructure that needed to be addressed. These aren't really one
coherent group, but they are all around flushing and reservations.
may_commit_transaction() needed to be updated a little bit, and we needed
to add a new state to force chunk allocation if things got dicey. Also,
because we can end up needing to reserve a whole bunch of extra space for
outstanding delayed refs, we needed to add the ability to fail only the
ENOSPC tickets that were too big to satisfy, instead of failing all of the
tickets. There's also a fix in here for one of the corner cases where we
didn't quite have enough space reserved for the delayed refs we were
generating during evict(). Thanks,

Josef
may_commit_transaction will skip committing the transaction if we don't
have enough pinned space or if we're trying to find space for a SYSTEM
chunk. However if we have pending free block groups in this transaction
we still want to commit as we may be able to allocate a chunk to make
our reservation. So instead of just returning ENOSPC, check if we have
free block groups pending, and if so commit the transaction to allow us
to use that free space.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/extent-tree.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8af68b13fa27..0dca250dc02e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4843,10 +4843,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 	if (!bytes)
 		return 0;
 
-	/* See if there is enough pinned space to make this reservation */
-	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+	trans = btrfs_join_transaction(fs_info->extent_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	/*
+	 * See if there is enough pinned space to make this reservation, or if
+	 * we have bg's that are going to be freed, allowing us to possibly do a
+	 * chunk allocation the next loop through.
+	 */
+	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+	    __percpu_counter_compare(&space_info->total_bytes_pinned, bytes,
+				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
 		goto commit;
 
 	/*
@@ -4854,7 +4862,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 	 * this reservation.
 	 */
 	if (space_info != delayed_rsv->space_info)
-		return -ENOSPC;
+		goto enospc;
 
 	spin_lock(&delayed_rsv->lock);
 	reclaim_bytes += delayed_rsv->reserved;
@@ -4868,17 +4876,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 	bytes -= reclaim_bytes;
 
 	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
-		return -ENOSPC;
-	}
-
+				     bytes,
+				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+		goto enospc;
 commit:
-	trans = btrfs_join_transaction(fs_info->extent_root);
-	if (IS_ERR(trans))
-		return -ENOSPC;
-
 	return btrfs_commit_transaction(trans);
+enospc:
+	btrfs_end_transaction(trans);
+	return -ENOSPC;
 }
 
 /*
-- 
2.14.3
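To make the reordering easier to follow, here is a minimal userspace model
of the post-patch decision order; the predicate variables are stand-ins for
the kernel-side checks, not real btrfs API:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel-side state; names are illustrative only. */
static bool have_free_bgs;          /* BTRFS_TRANS_HAVE_FREE_BGS set on the transaction */
static bool enough_pinned;          /* pinned bytes cover the reservation */
static bool same_space_info;        /* space_info == delayed_rsv->space_info */
static bool enough_after_reclaim;   /* pinned covers what's left after reclaiming rsvs */

static int may_commit_model(void)
{
	/*
	 * The transaction is now joined before any check, so both outcomes
	 * hold a handle: commit it on success, end it on -ENOSPC.
	 */
	if (have_free_bgs || enough_pinned)
		goto commit;
	if (!same_space_info)
		goto enospc;
	if (!enough_after_reclaim)
		goto enospc;
commit:
	puts("btrfs_commit_transaction(trans)");
	return 0;
enospc:
	puts("btrfs_end_transaction(trans)");
	return -ENOSPC;
}

int main(void)
{
	have_free_bgs = true;	/* pending freed block groups now force a commit */
	return may_commit_model() ? 1 : 0;
}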
For enospc_debug, having the block rsvs is super helpful to see if we've
done something wrong.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0dca250dc02e..7a30fbc05e5e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8052,6 +8052,15 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
+do {									\
+	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
+	spin_lock(&__rsv->lock);					\
+	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
+		   __rsv->size, __rsv->reserved);			\
+	spin_unlock(&__rsv->lock);					\
+} while (0)
+
 static void dump_space_info(struct btrfs_fs_info *fs_info,
 			    struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups)
@@ -8071,6 +8080,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
 			info->bytes_readonly);
 	spin_unlock(&info->lock);
 
+	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
 	if (!dump_block_groups)
 		return;
 
-- 
2.14.3
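Going by the format string in the macro, the extra enospc_debug output would
look roughly like the following; the "BTRFS info (device ...)" prefix and the
numbers here are assumptions for illustration, not captured output:

BTRFS info (device sda1): global_block_rsv: size 536870912 reserved 536870912
BTRFS info (device sda1): trans_block_rsv: size 0 reserved 0
BTRFS info (device sda1): chunk_block_rsv: size 0 reserved 0
BTRFS info (device sda1): delayed_block_rsv: size 0 reserved 0
BTRFS info (device sda1): delayed_refs_rsv: size 524288 reserved 524288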
We've done this forever because of the voodoo around knowing how much
space we have. However we have better ways of doing this now, and on
normal file systems we'll easily have a global reserve of 512MiB, and
since metadata chunks are usually 1GiB that means we'll allocate
metadata chunks more readily. Instead use the actual used amount when
determining if we need to allocate a chunk or not.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a30fbc05e5e..a91b3183dcae 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4388,21 +4388,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 			      struct btrfs_space_info *sinfo, int force)
 {
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
 	u64 thresh;
 
 	if (force == CHUNK_ALLOC_FORCE)
 		return 1;
 
-	/*
-	 * We need to take into account the global rsv because for all intents
-	 * and purposes it's used space. Don't worry about locking the
-	 * global_rsv, it doesn't change except when the transaction commits.
-	 */
-	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-		bytes_used += calc_global_rsv_need_space(global_rsv);
-
 	/*
 	 * in limited mode, we want to have some free space up to
 	 * about 1% of the FS size.
-- 
2.14.3
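A rough worked example of why this matters, assuming the non-forced path
still triggers allocation once used space crosses about 80% of the
space_info's total (as the div_factor(sinfo->total_bytes, 8) check did in
this era; the numbers are illustrative):

#include <stdio.h>

int main(void)
{
	/* Illustrative numbers: one 1GiB metadata chunk, 512MiB global rsv. */
	unsigned long long total = 1ULL << 30;       /* 1GiB of metadata space */
	unsigned long long global_rsv = 512ULL << 20;
	unsigned long long thresh = total * 8 / 10;  /* ~80% trigger */

	/* Before: the global reserve counted as used space. */
	printf("before: new chunk once real usage > %llu MiB\n",
	       (thresh - global_rsv) >> 20);	/* ~307 MiB */
	/* After: only actually-used bytes count toward the threshold. */
	printf("after:  new chunk once real usage > %llu MiB\n",
	       thresh >> 20);			/* ~819 MiB */
	return 0;
}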
With my change to no longer take into account the global reserve for
metadata allocation chunks we have this side-effect for mixed block
group fs'es where we are no longer allocating enough chunks for the
data/metadata requirements. To deal with this add an ALLOC_CHUNK_FORCE
step to the flushing state machine. This will only get used if we've
already made a full loop through the flushing machinery and tried
committing the transaction. If we have, then we can try and force a
chunk allocation since we likely need it to make progress. This
resolves the issues I was seeing with the mixed bg tests in xfstests
with my previous patch.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/ctree.h             |  3 ++-
 fs/btrfs/extent-tree.c       | 18 +++++++++++++++++-
 include/trace/events/btrfs.h |  1 +
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0c6d589c8ce4..8ccc5019172b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2750,7 +2750,8 @@ enum btrfs_flush_state {
 	FLUSH_DELALLOC		=	5,
 	FLUSH_DELALLOC_WAIT	=	6,
 	ALLOC_CHUNK		=	7,
-	COMMIT_TRANS		=	8,
+	ALLOC_CHUNK_FORCE	=	8,
+	COMMIT_TRANS		=	9,
 };
 
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a91b3183dcae..e6bb6ce23c84 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4927,6 +4927,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		btrfs_end_transaction(trans);
 		break;
 	case ALLOC_CHUNK:
+	case ALLOC_CHUNK_FORCE:
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
@@ -4934,7 +4935,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		}
 		ret = do_chunk_alloc(trans,
 				     btrfs_metadata_alloc_profile(fs_info),
-				     CHUNK_ALLOC_NO_FORCE);
+				     (state == ALLOC_CHUNK) ?
+				     CHUNK_ALLOC_NO_FORCE :
+				     CHUNK_ALLOC_FORCE);
 		btrfs_end_transaction(trans);
 		if (ret > 0 || ret == -ENOSPC)
 			ret = 0;
@@ -5070,6 +5073,19 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 				commit_cycles--;
 		}
 
+		/*
+		 * We don't want to force a chunk allocation until we've tried
+		 * pretty hard to reclaim space. Think of the case where we
+		 * free'd up a bunch of space and so have a lot of pinned space
+		 * to reclaim. We would rather use that than possibly create a
+		 * underutilized metadata chunk. So if this is our first run
+		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+		 * commit the transaction. If nothing has changed the next go
+		 * around then we can force a chunk allocation.
+		 */
+		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+			flush_state++;
+
 		if (flush_state > COMMIT_TRANS) {
 			commit_cycles++;
 			if (commit_cycles > 2) {
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 63d1f9d8b8c7..dd0e6f8d6b6e 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1051,6 +1051,7 @@ TRACE_EVENT(btrfs_trigger_flush,
 		{ FLUSH_DELAYED_REFS_NR,	"FLUSH_DELAYED_REFS_NR"},	\
 		{ FLUSH_DELAYED_REFS,		"FLUSH_ELAYED_REFS"},		\
 		{ ALLOC_CHUNK,			"ALLOC_CHUNK"},			\
+		{ ALLOC_CHUNK_FORCE,		"ALLOC_CHUNK_FORCE"},		\
 		{ COMMIT_TRANS,			"COMMIT_TRANS"})
 
 TRACE_EVENT(btrfs_flush_space,
-- 
2.14.3
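A compact model of the async-reclaim walk after this patch; the enum mirrors
the one in ctree.h, and the skip rule is the one added in
btrfs_async_reclaim_metadata_space (everything else is a stand-in):

#include <stdio.h>

enum flush_state {	/* mirrors enum btrfs_flush_state after the patch */
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	ALLOC_CHUNK_FORCE,
	COMMIT_TRANS,
};

int main(void)
{
	int commit_cycles = 0;

	/* Walk two full passes to show when ALLOC_CHUNK_FORCE first fires. */
	for (int pass = 0; pass < 2; pass++) {
		for (int state = FLUSH_DELAYED_ITEMS_NR; state <= COMMIT_TRANS; state++) {
			/*
			 * First pass: prefer reclaiming pinned space over
			 * carving out a possibly underutilized chunk.
			 */
			if (state == ALLOC_CHUNK_FORCE && !commit_cycles)
				continue;
			printf("pass %d: state %d\n", pass, state);
		}
		commit_cycles++;
	}
	return 0;
}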
With the introduction of the per-inode block_rsv it became possible to
have really really large reservation requests made because of data
fragmentation. Since the ticket stuff assumed that we'd always have
relatively small reservation requests it just killed all tickets if we
were unable to satisfy the current request. However this is generally
not the case anymore. So fix this logic to instead see if we had a
ticket that we were able to give some reservation to, and if we were,
continue the flushing loop again. Likewise we make the tickets use the
space_info_add_old_bytes() method of returning what reservation they did
receive in hopes that it could satisfy reservations down the line.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e6bb6ce23c84..983d086fa768 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4791,6 +4791,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 }
 
 struct reserve_ticket {
+	u64 orig_bytes;
 	u64 bytes;
 	int error;
 	struct list_head list;
@@ -5012,7 +5013,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
 {
 	struct reserve_ticket *ticket;
 
@@ -5021,7 +5022,10 @@ static void wake_all_tickets(struct list_head *head)
 		list_del_init(&ticket->list);
 		ticket->error = -ENOSPC;
 		wake_up(&ticket->wait);
+		if (ticket->bytes != ticket->orig_bytes)
+			return true;
 	}
+	return false;
 }
 
 /*
@@ -5089,8 +5093,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		if (flush_state > COMMIT_TRANS) {
 			commit_cycles++;
 			if (commit_cycles > 2) {
-				wake_all_tickets(&space_info->tickets);
-				space_info->flush = 0;
+				if (wake_all_tickets(&space_info->tickets)) {
+					flush_state = FLUSH_DELAYED_ITEMS_NR;
+					commit_cycles--;
+				} else {
+					space_info->flush = 0;
+				}
 			} else {
 				flush_state = FLUSH_DELAYED_ITEMS_NR;
 			}
@@ -5142,10 +5150,11 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 
 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 			       struct btrfs_space_info *space_info,
-			       struct reserve_ticket *ticket, u64 orig_bytes)
+			       struct reserve_ticket *ticket)
 
 {
 	DEFINE_WAIT(wait);
+	u64 reclaim_bytes = 0;
 	int ret = 0;
 
 	spin_lock(&space_info->lock);
@@ -5166,14 +5175,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 	ret = ticket->error;
 	if (!list_empty(&ticket->list))
 		list_del_init(&ticket->list);
-	if (ticket->bytes && ticket->bytes < orig_bytes) {
-		u64 num_bytes = orig_bytes - ticket->bytes;
-		update_bytes_may_use(space_info, -num_bytes);
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, num_bytes, 0);
-	}
+	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
 	spin_unlock(&space_info->lock);
 
+	if (reclaim_bytes)
+		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
 	return ret;
 }
 
@@ -5199,6 +5206,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 {
 	struct reserve_ticket ticket;
 	u64 used;
+	u64 reclaim_bytes = 0;
 	int ret = 0;
 
 	ASSERT(orig_bytes);
@@ -5234,6 +5242,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 * the list and we will do our own flushing further down.
 	 */
 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+		ticket.orig_bytes = orig_bytes;
 		ticket.bytes = orig_bytes;
 		ticket.error = 0;
 		init_waitqueue_head(&ticket.wait);
@@ -5274,25 +5283,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 		return ret;
 
 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		return wait_reserve_ticket(fs_info, space_info, &ticket,
-					   orig_bytes);
+		return wait_reserve_ticket(fs_info, space_info, &ticket);
 
 	ret = 0;
 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
 	spin_lock(&space_info->lock);
 	if (ticket.bytes) {
-		if (ticket.bytes < orig_bytes) {
-			u64 num_bytes = orig_bytes - ticket.bytes;
-			update_bytes_may_use(space_info, -num_bytes);
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      num_bytes, 0);
-
-		}
+		if (ticket.bytes < orig_bytes)
+			reclaim_bytes = orig_bytes - ticket.bytes;
 		list_del_init(&ticket.list);
 		ret = -ENOSPC;
 	}
 	spin_unlock(&space_info->lock);
+
+	if (reclaim_bytes)
+		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
 	ASSERT(list_empty(&ticket.list));
 	return ret;
 }
-- 
2.14.3
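A small model of the new wake_all_tickets contract: tickets are failed in
queue order until (and including) the first one that had already been
partially filled; if such a ticket exists the flusher restarts instead of
giving up. The ticket fields mirror struct reserve_ticket; the list handling
is simplified to an array:

#include <stdbool.h>
#include <stdio.h>

struct ticket {
	unsigned long long orig_bytes;	/* what the reserver asked for */
	unsigned long long bytes;	/* what is still outstanding */
};

static bool wake_all_tickets(struct ticket *t, int n)
{
	for (int i = 0; i < n; i++) {
		/* fail and wake this ticket with -ENOSPC ... */
		printf("fail ticket %d (-ENOSPC)\n", i);
		/*
		 * ... but if it had been partially filled, stop here and let
		 * the restarted flusher service whoever is still queued.
		 */
		if (t[i].bytes != t[i].orig_bytes)
			return true;
	}
	return false;
}

int main(void)
{
	struct ticket q[] = {
		{ .orig_bytes = 104857600, .bytes = 52428800 },	/* half-filled */
		{ .orig_bytes = 102400,    .bytes = 102400 },	/* untouched */
	};

	if (wake_all_tickets(q, 2))
		puts("restart flushing: a ticket had made progress");
	else
		puts("give up: nothing moved");
	return 0;
}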
With severe fragmentation we can end up with our inode rsv size being
huge during writeout, which would cause us to need to make very large
metadata reservations. However we may not actually need that much once
writeout is complete. So instead try to make our reservation, and if we
couldn't make it, re-calculate our new reservation size and try again.
If our reservation size doesn't change between tries then we know we are
actually out of space and can error out.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 56 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 983d086fa768..0e9ba77e5316 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5776,6 +5776,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
+static inline void __get_refill_bytes(struct btrfs_block_rsv *block_rsv,
+				      u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+	*metadata_bytes = 0;
+	*qgroup_bytes = 0;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved < block_rsv->size)
+		*metadata_bytes = block_rsv->size - block_rsv->reserved;
+	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+		*qgroup_bytes = block_rsv->qgroup_rsv_size -
+			block_rsv->qgroup_rsv_reserved;
+	spin_unlock(&block_rsv->lock);
+}
+
 /**
  * btrfs_inode_rsv_refill - refill the inode block rsv.
  * @inode - the inode we are refilling.
@@ -5791,25 +5806,37 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-	u64 num_bytes = 0;
+	u64 num_bytes = 0, last = 0;
 	u64 qgroup_num_bytes = 0;
 	int ret = -ENOSPC;
 
-	spin_lock(&block_rsv->lock);
-	if (block_rsv->reserved < block_rsv->size)
-		num_bytes = block_rsv->size - block_rsv->reserved;
-	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
-		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
-				   block_rsv->qgroup_rsv_reserved;
-	spin_unlock(&block_rsv->lock);
-
+	__get_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
 	if (num_bytes == 0)
 		return 0;
 
-	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
-	if (ret)
-		return ret;
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	do {
+		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
+		if (ret)
+			return ret;
+		ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+		if (ret) {
+			btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+			last = num_bytes;
+			/*
+			 * If we are fragmented we can end up with a lot of
+			 * outstanding extents which will make our size be much
+			 * larger than our reserved amount. If we happen to
+			 * try to do a reservation here that may result in us
+			 * trying to do a pretty hefty reservation, which we may
+			 * not need once delalloc flushing happens. If this is
+			 * the case try and do the reserve again.
+			 */
+			if (flush == BTRFS_RESERVE_FLUSH_ALL)
+				__get_refill_bytes(block_rsv, &num_bytes,
+						   &qgroup_num_bytes);
+		}
+	} while (ret && last != num_bytes);
+
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, false);
 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5819,8 +5846,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 		spin_lock(&block_rsv->lock);
 		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
 		spin_unlock(&block_rsv->lock);
-	} else
-		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+	}
 	return ret;
 }
-- 
2.14.3
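The retry loop boils down to "recompute the shortfall after each failed
attempt, and only give up when it stops shrinking". A standalone model of
that pattern, with reserve() and the shrinking sizes standing in for
reserve_metadata_bytes and flushing progress:

#include <stdio.h>

/* Stand-ins: the rsv size shrinks as delalloc writeout completes. */
static unsigned long long sizes[] = { 96, 40, 16, 16 };	/* MiB per recompute */
static int i;

static unsigned long long shortfall(void) { return sizes[i < 3 ? i : 3]; }
static int reserve(unsigned long long mib) { return mib > 24 ? -28 : 0; } /* -28 = -ENOSPC */

int main(void)
{
	unsigned long long num = shortfall(), last = 0;
	int ret;

	do {
		ret = reserve(num);
		if (ret) {
			last = num;
			i++;			/* model flushing progress */
			num = shortfall();	/* re-read size - reserved */
		}
	} while (ret && last != num);	/* stop once the target stabilizes */

	if (ret)
		puts("ENOSPC: shortfall stopped shrinking");
	else
		printf("reserved %llu MiB after retries\n", num);
	return 0;
}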
For FLUSH_LIMIT flushers we really can only allocate chunks and flush
delayed inode items, everything else is problematic. I added a bunch of
new states and it led to weirdness in the FLUSH_LIMIT case because I
forgot about how it worked. So instead explicitly declare the states
that are ok for flushing with FLUSH_LIMIT and use that for our state
machine. Then as we add new things that are safe we can just add them
to this list.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0e9ba77e5316..e31980d451c2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5112,12 +5112,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
 }
 
+static const enum btrfs_flush_state priority_flush_states[] = {
+	FLUSH_DELAYED_ITEMS_NR,
+	FLUSH_DELAYED_ITEMS,
+	ALLOC_CHUNK,
+};
+
 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 					    struct btrfs_space_info *space_info,
 					    struct reserve_ticket *ticket)
 {
 	u64 to_reclaim;
-	int flush_state = FLUSH_DELAYED_ITEMS_NR;
+	int flush_state = 0;
 
 	spin_lock(&space_info->lock);
 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5129,7 +5135,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 	spin_unlock(&space_info->lock);
 
 	do {
-		flush_space(fs_info, space_info, to_reclaim, flush_state);
+		flush_space(fs_info, space_info, to_reclaim,
+			    priority_flush_states[flush_state]);
 		flush_state++;
 		spin_lock(&space_info->lock);
 		if (ticket->bytes == 0) {
@@ -5137,15 +5144,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 			return;
 		}
 		spin_unlock(&space_info->lock);
-
-		/*
-		 * Priority flushers can't wait on delalloc without
-		 * deadlocking.
-		 */
-		if (flush_state == FLUSH_DELALLOC ||
-		    flush_state == FLUSH_DELALLOC_WAIT)
-			flush_state = ALLOC_CHUNK;
-	} while (flush_state < COMMIT_TRANS);
+	} while (flush_state < ARRAY_SIZE(priority_flush_states));
 }
 
 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
-- 
2.14.3
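The same table-driven pattern in isolation: an explicit allowlist of states
plus an index, instead of incrementing through the full enum and
special-casing the states to skip. The enum values mirror btrfs_flush_state;
the flush itself is stubbed:

#include <stdio.h>

enum flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS, ALLOC_CHUNK = 7 };

/* Only the states that are deadlock-safe for FLUSH_LIMIT reservations. */
static const enum flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static void flush_space(enum flush_state s) { printf("flush state %d\n", s); }

int main(void)
{
	/* flush_state is now an index into the table, not an enum value. */
	size_t flush_state = 0;

	do {
		flush_space(priority_flush_states[flush_state]);
		flush_state++;
	} while (flush_state < sizeof(priority_flush_states) /
			       sizeof(priority_flush_states[0]));
	return 0;
}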
We could generate a lot of delayed refs in evict but never have any left
over space from our block rsv to make up for that fact. So reserve some
extra space and give it to the transaction so it can be used to refill
the delayed refs rsv every loop through the truncate path.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/inode.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cae30f6c095f..3da9ac463344 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5258,13 +5258,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
 	int failures = 0;
 
 	for (;;) {
 		struct btrfs_trans_handle *trans;
 		int ret;
 
-		ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+		ret = btrfs_block_rsv_refill(root, rsv,
+					     rsv->size + delayed_refs_extra,
 					     BTRFS_RESERVE_FLUSH_LIMIT);
 
 		if (ret && ++failures > 2) {
@@ -5273,9 +5275,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 			return ERR_PTR(-ENOSPC);
 		}
 
+		/*
+		 * Evict can generate a large amount of delayed refs without
+		 * having a way to add space back since we exhaust our temporary
+		 * block rsv. We aren't allowed to do FLUSH_ALL in this case
+		 * because we could deadlock with so many things in the flushing
+		 * code, so we have to try and hold some extra space to
+		 * compensate for our delayed ref generation. If we can't get
+		 * that space then we need see if we can steal our minimum from
+		 * the global reserve. We will be ratelimited by the amount of
+		 * space we have for the delayed refs rsv, so we'll end up
+		 * committing and trying again.
+		 */
 		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans) || !ret)
+		if (IS_ERR(trans) || !ret) {
+			if (!IS_ERR(trans)) {
+				trans->block_rsv = &fs_info->trans_block_rsv;
+				trans->bytes_reserved = delayed_refs_extra;
+				btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+							delayed_refs_extra, 1);
+			}
 			return trans;
+		}
 
 		/*
 		 * Try to steal from the global reserve if there is space for
-- 
2.14.3
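In outline, the change refills to rsv->size plus one item's worth of
metadata overhead, then migrates that extra into the transaction's rsv so
each truncate loop can top up the delayed refs rsv. A sketch of the
bookkeeping; the sizes are illustrative, and the 256KiB figure assumes the
usual nodesize * BTRFS_MAX_LEVEL * 2 formula behind
btrfs_calc_trans_metadata_size with 16KiB nodes:

#include <stdio.h>

struct rsv { unsigned long long reserved; };

/* Move bytes from one rsv's accounting to another (simplified migrate). */
static void migrate(struct rsv *src, struct rsv *dst, unsigned long long bytes)
{
	src->reserved -= bytes;
	dst->reserved += bytes;
}

int main(void)
{
	/* One item of tree-operation overhead: 16KiB * 8 levels * 2 = 256KiB. */
	unsigned long long delayed_refs_extra = 16384ULL * 8 * 2;
	unsigned long long rsv_size = 1ULL << 20;	/* illustrative */
	struct rsv evict_rsv = { 0 }, trans_rsv = { 0 };

	/* Refill to rsv->size + extra instead of just rsv->size ... */
	evict_rsv.reserved = rsv_size + delayed_refs_extra;
	/* ... then hand the extra to the transaction for delayed refs. */
	migrate(&evict_rsv, &trans_rsv, delayed_refs_extra);

	printf("evict rsv %llu, trans rsv %llu\n",
	       evict_rsv.reserved, trans_rsv.reserved);
	return 0;
}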
On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> may_commit_transaction will skip committing the transaction if we don't
> have enough pinned space or if we're trying to find space for a SYSTEM
> chunk. However if we have pending free block groups in this transaction
> we still want to commit as we may be able to allocate a chunk to make
> our reservation. So instead of just returning ENOSPC, check if we have
> free block groups pending, and if so commit the transaction to allow us
> to use that free space.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> Reviewed-by: Omar Sandoval <osandov@fb.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

> ---
>  fs/btrfs/extent-tree.c | 33 +++++++++++++++++++--------------
>  1 file changed, 19 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 8af68b13fa27..0dca250dc02e 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4843,10 +4843,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>  	if (!bytes)
>  		return 0;
>  
> -	/* See if there is enough pinned space to make this reservation */
> -	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
> -				   bytes,
> -				   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
> +	trans = btrfs_join_transaction(fs_info->extent_root);
> +	if (IS_ERR(trans))
> +		return PTR_ERR(trans);
> +
> +	/*
> +	 * See if there is enough pinned space to make this reservation, or if
> +	 * we have bg's that are going to be freed, allowing us to possibly do a
> +	 * chunk allocation the next loop through.
> +	 */
> +	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
> +	    __percpu_counter_compare(&space_info->total_bytes_pinned, bytes,
> +				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
>  		goto commit;
>  
>  	/*
> @@ -4854,7 +4862,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>  	 * this reservation.
>  	 */
>  	if (space_info != delayed_rsv->space_info)
> -		return -ENOSPC;
> +		goto enospc;
>  
>  	spin_lock(&delayed_rsv->lock);
>  	reclaim_bytes += delayed_rsv->reserved;
> @@ -4868,17 +4876,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>  	bytes -= reclaim_bytes;
>  
>  	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
> -				   bytes,
> -				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
> -		return -ENOSPC;
> -	}
> -
> +				     bytes,
> +				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
> +		goto enospc;
>  commit:
> -	trans = btrfs_join_transaction(fs_info->extent_root);
> -	if (IS_ERR(trans))
> -		return -ENOSPC;
> -
>  	return btrfs_commit_transaction(trans);
> +enospc:
> +	btrfs_end_transaction(trans);
> +	return -ENOSPC;
>  }
>  
>  /*
> 
On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> We've done this forever because of the voodoo around knowing how much
> space we have. However we have better ways of doing this now, and on
> normal file systems we'll easily have a global reserve of 512MiB, and
> since metadata chunks are usually 1GiB that means we'll allocate
> metadata chunks more readily. Instead use the actual used amount when
> determining if we need to allocate a chunk or not.

This explanation could use more concrete wording; currently it's way too
"hand wavy"/vague.

> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 9 ---------
>  1 file changed, 9 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 7a30fbc05e5e..a91b3183dcae 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4388,21 +4388,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
>  static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
>  			      struct btrfs_space_info *sinfo, int force)
>  {
> -	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
>  	u64 bytes_used = btrfs_space_info_used(sinfo, false);
>  	u64 thresh;
>  
>  	if (force == CHUNK_ALLOC_FORCE)
>  		return 1;
>  
> -	/*
> -	 * We need to take into account the global rsv because for all intents
> -	 * and purposes it's used space. Don't worry about locking the
> -	 * global_rsv, it doesn't change except when the transaction commits.
> -	 */
> -	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
> -		bytes_used += calc_global_rsv_need_space(global_rsv);
> -
>  	/*
>  	 * in limited mode, we want to have some free space up to
>  	 * about 1% of the FS size.
> 
On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> With my change to no longer take into account the global reserve for
> metadata allocation chunks we have this side-effect for mixed block
> group fs'es where we are no longer allocating enough chunks for the
> data/metadata requirements. To deal with this add an ALLOC_CHUNK_FORCE
> step to the flushing state machine. This will only get used if we've
> already made a full loop through the flushing machinery and tried
> committing the transaction. If we have, then we can try and force a
> chunk allocation since we likely need it to make progress. This
> resolves the issues I was seeing with the mixed bg tests in xfstests
> with my previous patch.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

Still, my observation is that the metadata reclaim code is increasing in
complexity for rather niche use cases, or the details become way too
subtle.

> ---
>  fs/btrfs/ctree.h             |  3 ++-
>  fs/btrfs/extent-tree.c       | 18 +++++++++++++++++-
>  include/trace/events/btrfs.h |  1 +
>  3 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 0c6d589c8ce4..8ccc5019172b 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -2750,7 +2750,8 @@ enum btrfs_flush_state {
>  	FLUSH_DELALLOC		=	5,
>  	FLUSH_DELALLOC_WAIT	=	6,
>  	ALLOC_CHUNK		=	7,
> -	COMMIT_TRANS		=	8,
> +	ALLOC_CHUNK_FORCE	=	8,
> +	COMMIT_TRANS		=	9,
>  };
>  
>  int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index a91b3183dcae..e6bb6ce23c84 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4927,6 +4927,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
>  		btrfs_end_transaction(trans);
>  		break;
>  	case ALLOC_CHUNK:
> +	case ALLOC_CHUNK_FORCE:
>  		trans = btrfs_join_transaction(root);
>  		if (IS_ERR(trans)) {
>  			ret = PTR_ERR(trans);
> @@ -4934,7 +4935,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
>  		}
>  		ret = do_chunk_alloc(trans,
>  				     btrfs_metadata_alloc_profile(fs_info),
> -				     CHUNK_ALLOC_NO_FORCE);
> +				     (state == ALLOC_CHUNK) ?
> +				     CHUNK_ALLOC_NO_FORCE :
> +				     CHUNK_ALLOC_FORCE);
>  		btrfs_end_transaction(trans);
>  		if (ret > 0 || ret == -ENOSPC)
>  			ret = 0;
> @@ -5070,6 +5073,19 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  				commit_cycles--;
>  		}
>  
> +		/*
> +		 * We don't want to force a chunk allocation until we've tried
> +		 * pretty hard to reclaim space. Think of the case where we
> +		 * free'd up a bunch of space and so have a lot of pinned space
> +		 * to reclaim. We would rather use that than possibly create a
> +		 * underutilized metadata chunk. So if this is our first run
> +		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
> +		 * commit the transaction. If nothing has changed the next go
> +		 * around then we can force a chunk allocation.
> +		 */
> +		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
> +			flush_state++;
> +
>  		if (flush_state > COMMIT_TRANS) {
>  			commit_cycles++;
>  			if (commit_cycles > 2) {
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index 63d1f9d8b8c7..dd0e6f8d6b6e 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -1051,6 +1051,7 @@ TRACE_EVENT(btrfs_trigger_flush,
>  		{ FLUSH_DELAYED_REFS_NR,	"FLUSH_DELAYED_REFS_NR"},	\
>  		{ FLUSH_DELAYED_REFS,		"FLUSH_ELAYED_REFS"},		\
>  		{ ALLOC_CHUNK,			"ALLOC_CHUNK"},			\
> +		{ ALLOC_CHUNK_FORCE,		"ALLOC_CHUNK_FORCE"},		\
>  		{ COMMIT_TRANS,			"COMMIT_TRANS"})
>  
>  TRACE_EVENT(btrfs_flush_space,
> 
On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> With the introduction of the per-inode block_rsv it became possible to
> have really really large reservation requests made because of data
> fragmentation. Since the ticket stuff assumed that we'd always have
> relatively small reservation requests it just killed all tickets if we
> were unable to satisfy the current request. However this is generally
> not the case anymore. So fix this logic to instead see if we had a
> ticket that we were able to give some reservation to, and if we were,
> continue the flushing loop again. Likewise we make the tickets use the
> space_info_add_old_bytes() method of returning what reservation they did
> receive in hopes that it could satisfy reservations down the line.

The logic of the patch can be summarised as follows:

If no progress is made for a ticket, then start failing all tickets until
the first one that has progress made on its reservation (inclusive). In
this case this first ticket will be failed but at least its space will
be reused via space_info_add_old_bytes.

Frankly this seems really arbitrary.

> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++++--------------------
>  1 file changed, 25 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index e6bb6ce23c84..983d086fa768 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4791,6 +4791,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  }
>  
>  struct reserve_ticket {
> +	u64 orig_bytes;
>  	u64 bytes;
>  	int error;
>  	struct list_head list;
> @@ -5012,7 +5013,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
>  		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
>  }
>  
> -static void wake_all_tickets(struct list_head *head)
> +static bool wake_all_tickets(struct list_head *head)
>  {
>  	struct reserve_ticket *ticket;
>  
> @@ -5021,7 +5022,10 @@ static void wake_all_tickets(struct list_head *head)
>  		list_del_init(&ticket->list);
>  		ticket->error = -ENOSPC;
>  		wake_up(&ticket->wait);
> +		if (ticket->bytes != ticket->orig_bytes)
> +			return true;
>  	}
> +	return false;
>  }
>  
>  /*
> @@ -5089,8 +5093,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  		if (flush_state > COMMIT_TRANS) {
>  			commit_cycles++;
>  			if (commit_cycles > 2) {
> -				wake_all_tickets(&space_info->tickets);
> -				space_info->flush = 0;
> +				if (wake_all_tickets(&space_info->tickets)) {
> +					flush_state = FLUSH_DELAYED_ITEMS_NR;
> +					commit_cycles--;
> +				} else {
> +					space_info->flush = 0;
> +				}
>  			} else {
>  				flush_state = FLUSH_DELAYED_ITEMS_NR;
>  			}
> @@ -5142,10 +5150,11 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  
>  static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
>  			       struct btrfs_space_info *space_info,
> -			       struct reserve_ticket *ticket, u64 orig_bytes)
> +			       struct reserve_ticket *ticket)
>  
>  {
>  	DEFINE_WAIT(wait);
> +	u64 reclaim_bytes = 0;
>  	int ret = 0;
>  
>  	spin_lock(&space_info->lock);
> @@ -5166,14 +5175,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
>  	ret = ticket->error;
>  	if (!list_empty(&ticket->list))
>  		list_del_init(&ticket->list);
> -	if (ticket->bytes && ticket->bytes < orig_bytes) {
> -		u64 num_bytes = orig_bytes - ticket->bytes;
> -		update_bytes_may_use(space_info, -num_bytes);
> -		trace_btrfs_space_reservation(fs_info, "space_info",
> -					      space_info->flags, num_bytes, 0);
> -	}
> +	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
> +		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
>  	spin_unlock(&space_info->lock);
>  
> +	if (reclaim_bytes)
> +		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
>  	return ret;
>  }
>  
> @@ -5199,6 +5206,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  {
>  	struct reserve_ticket ticket;
>  	u64 used;
> +	u64 reclaim_bytes = 0;
>  	int ret = 0;
>  
>  	ASSERT(orig_bytes);
> @@ -5234,6 +5242,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  	 * the list and we will do our own flushing further down.
>  	 */
>  	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
> +		ticket.orig_bytes = orig_bytes;
>  		ticket.bytes = orig_bytes;
>  		ticket.error = 0;
>  		init_waitqueue_head(&ticket.wait);
> @@ -5274,25 +5283,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  		return ret;
>  
>  	if (flush == BTRFS_RESERVE_FLUSH_ALL)
> -		return wait_reserve_ticket(fs_info, space_info, &ticket,
> -					   orig_bytes);
> +		return wait_reserve_ticket(fs_info, space_info, &ticket);
>  
>  	ret = 0;
>  	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
>  	spin_lock(&space_info->lock);
>  	if (ticket.bytes) {
> -		if (ticket.bytes < orig_bytes) {
> -			u64 num_bytes = orig_bytes - ticket.bytes;
> -			update_bytes_may_use(space_info, -num_bytes);
> -			trace_btrfs_space_reservation(fs_info, "space_info",
> -						      space_info->flags,
> -						      num_bytes, 0);
> -
> -		}
> +		if (ticket.bytes < orig_bytes)
> +			reclaim_bytes = orig_bytes - ticket.bytes;
>  		list_del_init(&ticket.list);
>  		ret = -ENOSPC;
>  	}
>  	spin_unlock(&space_info->lock);
> +
> +	if (reclaim_bytes)
> +		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
>  	ASSERT(list_empty(&ticket.list));
>  	return ret;
>  }
> 
On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> For FLUSH_LIMIT flushers we really can only allocate chunks and flush
> delayed inode items, everything else is problematic. I added a bunch of
> new states and it led to weirdness in the FLUSH_LIMIT case because I
> forgot about how it worked. So instead explicitly declare the states
> that are ok for flushing with FLUSH_LIMIT and use that for our state
> machine. Then as we add new things that are safe we can just add them
> to this list.

Code-wise it's ok but the changelog needs rewording. At the very least
explain the weirdness. Also in the last sentence the word 'thing' is
better substituted with "flush states".

> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 21 ++++++++++-----------
>  1 file changed, 10 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0e9ba77e5316..e31980d451c2 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5112,12 +5112,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
>  	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
>  }
>  
> +static const enum btrfs_flush_state priority_flush_states[] = {
> +	FLUSH_DELAYED_ITEMS_NR,
> +	FLUSH_DELAYED_ITEMS,
> +	ALLOC_CHUNK,
> +};
> +
>  static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  					    struct btrfs_space_info *space_info,
>  					    struct reserve_ticket *ticket)
>  {
>  	u64 to_reclaim;
> -	int flush_state = FLUSH_DELAYED_ITEMS_NR;
> +	int flush_state = 0;
>  
>  	spin_lock(&space_info->lock);
>  	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
> @@ -5129,7 +5135,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  	spin_unlock(&space_info->lock);
>  
>  	do {
> -		flush_space(fs_info, space_info, to_reclaim, flush_state);
> +		flush_space(fs_info, space_info, to_reclaim,
> +			    priority_flush_states[flush_state]);
>  		flush_state++;
>  		spin_lock(&space_info->lock);
>  		if (ticket->bytes == 0) {
> @@ -5137,15 +5144,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  			return;
>  		}
>  		spin_unlock(&space_info->lock);
> -
> -		/*
> -		 * Priority flushers can't wait on delalloc without
> -		 * deadlocking.
> -		 */
> -		if (flush_state == FLUSH_DELALLOC ||
> -		    flush_state == FLUSH_DELALLOC_WAIT)
> -			flush_state = ALLOC_CHUNK;
> -	} while (flush_state < COMMIT_TRANS);
> +	} while (flush_state < ARRAY_SIZE(priority_flush_states));
>  }
>  
>  static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
> 
On 26.11.18 г. 14:41 ч., Nikolay Borisov wrote:
> 
> 
> On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
>> For FLUSH_LIMIT flushers we really can only allocate chunks and flush
>> delayed inode items, everything else is problematic. I added a bunch of
>> new states and it led to weirdness in the FLUSH_LIMIT case because I
>> forgot about how it worked. So instead explicitly declare the states
>> that are ok for flushing with FLUSH_LIMIT and use that for our state
>> machine. Then as we add new things that are safe we can just add them
>> to this list.
> 
> 
> Code-wise it's ok but the changelog needs rewording. At the very least
> explain the weirdness. Also in the last sentence the word 'thing' is
> better substituted with "flush states".

Case in point, you yourself mention that you have forgotten how the
FLUSH_LIMIT case works. That's why we need good changelogs so that those
details can be quickly worked out from reading the changelog.

> 
>> 
>> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
>> ---
>>  fs/btrfs/extent-tree.c | 21 ++++++++++-----------
>>  1 file changed, 10 insertions(+), 11 deletions(-)
>> 
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 0e9ba77e5316..e31980d451c2 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -5112,12 +5112,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
>>  	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
>>  }
>>  
>> +static const enum btrfs_flush_state priority_flush_states[] = {
>> +	FLUSH_DELAYED_ITEMS_NR,
>> +	FLUSH_DELAYED_ITEMS,
>> +	ALLOC_CHUNK,
>> +};
>> +
>>  static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>>  					    struct btrfs_space_info *space_info,
>>  					    struct reserve_ticket *ticket)
>>  {
>>  	u64 to_reclaim;
>> -	int flush_state = FLUSH_DELAYED_ITEMS_NR;
>> +	int flush_state = 0;
>>  
>>  	spin_lock(&space_info->lock);
>>  	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
>> @@ -5129,7 +5135,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>>  	spin_unlock(&space_info->lock);
>>  
>>  	do {
>> -		flush_space(fs_info, space_info, to_reclaim, flush_state);
>> +		flush_space(fs_info, space_info, to_reclaim,
>> +			    priority_flush_states[flush_state]);
>>  		flush_state++;
>>  		spin_lock(&space_info->lock);
>>  		if (ticket->bytes == 0) {
>> @@ -5137,15 +5144,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>>  			return;
>>  		}
>>  		spin_unlock(&space_info->lock);
>> -
>> -		/*
>> -		 * Priority flushers can't wait on delalloc without
>> -		 * deadlocking.
>> -		 */
>> -		if (flush_state == FLUSH_DELALLOC ||
>> -		    flush_state == FLUSH_DELALLOC_WAIT)
>> -			flush_state = ALLOC_CHUNK;
>> -	} while (flush_state < COMMIT_TRANS);
>> +	} while (flush_state < ARRAY_SIZE(priority_flush_states));
>>  }
>>  
>>  static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
>> 
> 
On Mon, Nov 26, 2018 at 02:25:52PM +0200, Nikolay Borisov wrote:
>
>
> On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> > With the introduction of the per-inode block_rsv it became possible to
> > have really really large reservation requests made because of data
> > fragmentation. Since the ticket stuff assumed that we'd always have
> > relatively small reservation requests it just killed all tickets if we
> > were unable to satisfy the current request. However this is generally
> > not the case anymore. So fix this logic to instead see if we had a
> > ticket that we were able to give some reservation to, and if we were,
> > continue the flushing loop again. Likewise we make the tickets use the
> > space_info_add_old_bytes() method of returning what reservation they did
> > receive in hopes that it could satisfy reservations down the line.
>
>
> The logic of the patch can be summarised as follows:
>
> If no progress is made for a ticket, then start failing all tickets until
> the first one that has progress made on its reservation (inclusive). In
> this case this first ticket will be failed but at least its space will
> be reused via space_info_add_old_bytes.
>
> Frankly this seems really arbitrary.
It's not though. The tickets are in order of who requested the reservation.
Because we will backfill reservations for things like hugely fragmented files or
large amounts of delayed refs, we can have spikes where we're trying to reserve
100MiB of metadata space. We may fill 50MiB of that before we run out of space.
So we can't satisfy that reservation, but the small 100KiB reservations that
are waiting to be serviced can be satisfied and they can run. The alternative
is you get ENOSPC and then you can turn around and touch a file no problem
because it's a small reservation and there was room for it. This patch enables
better behavior for the user. Thanks,
Josef
On 27.11.18 г. 21:46 ч., Josef Bacik wrote:
> On Mon, Nov 26, 2018 at 02:25:52PM +0200, Nikolay Borisov wrote:
>>
>>
>> On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
>>> With the introduction of the per-inode block_rsv it became possible to
>>> have really really large reservation requests made because of data
>>> fragmentation. Since the ticket stuff assumed that we'd always have
>>> relatively small reservation requests it just killed all tickets if we
>>> were unable to satisfy the current request. However this is generally
>>> not the case anymore. So fix this logic to instead see if we had a
>>> ticket that we were able to give some reservation to, and if we were,
>>> continue the flushing loop again. Likewise we make the tickets use the
>>> space_info_add_old_bytes() method of returning what reservation they did
>>> receive in hopes that it could satisfy reservations down the line.
>>
>>
>> The logic of the patch can be summarised as follows:
>>
>> If no progress is made for a ticket, then start failing all tickets until
>> the first one that has progress made on its reservation (inclusive). In
>> this case this first ticket will be failed but at least its space will
>> be reused via space_info_add_old_bytes.
>>
>> Frankly this seems really arbitrary.
> 
> It's not though. The tickets are in order of who requested the reservation.
> Because we will backfill reservations for things like hugely fragmented files or
> large amounts of delayed refs, we can have spikes where we're trying to reserve
> 100MiB of metadata space. We may fill 50MiB of that before we run out of space.
> So we can't satisfy that reservation, but the small 100KiB reservations that
> are waiting to be serviced can be satisfied and they can run. The alternative
> is you get ENOSPC and then you can turn around and touch a file no problem
> because it's a small reservation and there was room for it. This patch enables
> better behavior for the user. Thanks,

Well this information needs to be in the changelog since it describes the
situation where this patch is useful.

> 
> Josef
> 
For FLUSH_LIMIT flushers (think evict, truncate) we can deadlock when
running delalloc because we may be holding a tree lock. We can also
deadlock with delayed refs rsv's that are running via the committing
mechanism. The only safe operations for FLUSH_LIMIT are to run the
delayed operations and to allocate chunks, everything else has the
potential to deadlock. Future proof this by explicitly specifying the
states that FLUSH_LIMIT is allowed to use. This will keep us from
introducing bugs later on when adding new flush states.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0e1a499035ac..ab9d915d9289 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5123,12 +5123,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
 }
 
+static const enum btrfs_flush_state priority_flush_states[] = {
+	FLUSH_DELAYED_ITEMS_NR,
+	FLUSH_DELAYED_ITEMS,
+	ALLOC_CHUNK,
+};
+
 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 					    struct btrfs_space_info *space_info,
 					    struct reserve_ticket *ticket)
 {
 	u64 to_reclaim;
-	int flush_state = FLUSH_DELAYED_ITEMS_NR;
+	int flush_state = 0;
 
 	spin_lock(&space_info->lock);
 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5140,7 +5146,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 	spin_unlock(&space_info->lock);
 
 	do {
-		flush_space(fs_info, space_info, to_reclaim, flush_state);
+		flush_space(fs_info, space_info, to_reclaim,
+			    priority_flush_states[flush_state]);
 		flush_state++;
 		spin_lock(&space_info->lock);
 		if (ticket->bytes == 0) {
@@ -5148,15 +5155,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 			return;
 		}
 		spin_unlock(&space_info->lock);
-
-		/*
-		 * Priority flushers can't wait on delalloc without
-		 * deadlocking.
-		 */
-		if (flush_state == FLUSH_DELALLOC ||
-		    flush_state == FLUSH_DELALLOC_WAIT)
-			flush_state = ALLOC_CHUNK;
-	} while (flush_state < COMMIT_TRANS);
+	} while (flush_state < ARRAY_SIZE(priority_flush_states));
 }
 
 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
-- 
2.14.3
On Mon, Dec 03, 2018 at 10:24:58AM -0500, Josef Bacik wrote:
> For FLUSH_LIMIT flushers (think evict, truncate) we can deadlock when
> running delalloc because we may be holding a tree lock. We can also
> deadlock with delayed refs rsv's that are running via the committing
> mechanism. The only safe operations for FLUSH_LIMIT are to run the
> delayed operations and to allocate chunks, everything else has the
> potential to deadlock. Future proof this by explicitly specifying the
> states that FLUSH_LIMIT is allowed to use. This will keep us from
> introducing bugs later on when adding new flush states.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
On 3.12.18 г. 17:24 ч., Josef Bacik wrote:
> For FLUSH_LIMIT flushers (think evict, truncate) we can deadlock when
> running delalloc because we may be holding a tree lock. We can also
> deadlock with delayed refs rsv's that are running via the committing
> mechanism. The only safe operations for FLUSH_LIMIT are to run the
> delayed operations and to allocate chunks, everything else has the
> potential to deadlock. Future proof this by explicitly specifying the
> states that FLUSH_LIMIT is allowed to use. This will keep us from
> introducing bugs later on when adding new flush states.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

> ---
>  fs/btrfs/extent-tree.c | 21 ++++++++++-----------
>  1 file changed, 10 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0e1a499035ac..ab9d915d9289 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5123,12 +5123,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
>  	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
>  }
>  
> +static const enum btrfs_flush_state priority_flush_states[] = {
> +	FLUSH_DELAYED_ITEMS_NR,
> +	FLUSH_DELAYED_ITEMS,
> +	ALLOC_CHUNK,
> +};
> +
>  static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  					    struct btrfs_space_info *space_info,
>  					    struct reserve_ticket *ticket)
>  {
>  	u64 to_reclaim;
> -	int flush_state = FLUSH_DELAYED_ITEMS_NR;
> +	int flush_state = 0;
>  
>  	spin_lock(&space_info->lock);
>  	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
> @@ -5140,7 +5146,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  	spin_unlock(&space_info->lock);
>  
>  	do {
> -		flush_space(fs_info, space_info, to_reclaim, flush_state);
> +		flush_space(fs_info, space_info, to_reclaim,
> +			    priority_flush_states[flush_state]);
>  		flush_state++;
>  		spin_lock(&space_info->lock);
>  		if (ticket->bytes == 0) {
> @@ -5148,15 +5155,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  			return;
>  		}
>  		spin_unlock(&space_info->lock);
> -
> -		/*
> -		 * Priority flushers can't wait on delalloc without
> -		 * deadlocking.
> -		 */
> -		if (flush_state == FLUSH_DELALLOC ||
> -		    flush_state == FLUSH_DELALLOC_WAIT)
> -			flush_state = ALLOC_CHUNK;
> -	} while (flush_state < COMMIT_TRANS);
> +	} while (flush_state < ARRAY_SIZE(priority_flush_states));
>  }
>  
>  static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
> 