From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753209AbdGSHla (ORCPT );
	Wed, 19 Jul 2017 03:41:30 -0400
Received: from mx2.suse.de ([195.135.220.15]:52704 "EHLO mx1.suse.de"
	rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
	id S1752759AbdGSHk6 (ORCPT );
	Wed, 19 Jul 2017 03:40:58 -0400
Date: Wed, 19 Jul 2017 09:40:56 +0200
From: Jan Kara
To: Davidlohr Bueso
Cc: akpm@linux-foundation.org, mingo@kernel.org, peterz@infradead.org,
	jack@suse.cz, torvalds@linux-foundation.org,
	kirill.shutemov@linux.intel.com, hch@infradead.org,
	ldufour@linux.vnet.ibm.com, mhocko@suse.com,
	mgorman@techsingularity.net, linux-kernel@vger.kernel.org,
	tytso@mit.edu, Davidlohr Bueso
Subject: Re: [PATCH 15/17] fs/ext4: use cached rbtrees
Message-ID: <20170719074056.GA12546@quack2.suse.cz>
References: <20170719014603.19029-1-dave@stgolabs.net>
	<20170719014603.19029-16-dave@stgolabs.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20170719014603.19029-16-dave@stgolabs.net>
User-Agent: Mutt/1.5.24 (2015-08-30)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

On Tue 18-07-17 18:46:01, Davidlohr Bueso wrote:
> This patch replaces two uses of rbtrees with the cached version for
> better rb_first() performance during:
>
> * Tree traversals to mark blocks as used during ext4_mb_init_cache()
>   looping. (struct ext4_group_info)
>
> * Storing ->curr_node for dx_readdir(). (struct dir_private_info)
>
> Caching the leftmost node comes at negligible runtime cost, thus the
> only downside is that the aforementioned structures are enlarged by
> the extra pointer. No changes in semantics whatsoever.
>
> Cc: tytso@mit.edu
> Signed-off-by: Davidlohr Bueso

Hum, so I'm somewhat undecided whether this is worth the churn. For the
free blocks rb_tree we use rb_first() only in
ext4_mb_generate_from_freelist(), which gets called only when generating a
new buddy bitmap from the on-disk bitmap, and we traverse the whole tree
after that - thus the extra cost of rb_first() is a) well hidden in the
total cost of the iteration, b) rather rare anyway. Similarly for the
H-tree directory code, we call rb_first() in ext4_dx_readdir() only to
start an iteration over the whole red-black tree, and in that case I don't
think optimizing rb_first() makes a big difference (maintaining the cached
value is going to cost about as much as we save by using it).
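
(For reference, this is roughly what the rb_root_cached interface proposed
by the series boils down to - a simplified sketch, not the exact code:)

	struct rb_root_cached {
		struct rb_root rb_root;
		struct rb_node *rb_leftmost;	/* cached rb_first() result */
	};

	#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }

	/* O(1) lookup instead of walking left down the tree */
	static inline struct rb_node *rb_first_cached(struct rb_root_cached *root)
	{
		return root->rb_leftmost;
	}

	/* every insertion passes 'leftmost' down to keep the cache current */
	static inline void rb_insert_color_cached(struct rb_node *node,
						  struct rb_root_cached *root,
						  bool leftmost)
	{
		if (leftmost)
			root->rb_leftmost = node;
		rb_insert_color(node, &root->rb_root);
	}

	/* every erase has to check whether it removes the cached node */
	static inline void rb_erase_cached(struct rb_node *node,
					   struct rb_root_cached *root)
	{
		if (root->rb_leftmost == node)
			root->rb_leftmost = rb_next(node);
		rb_erase(node, &root->rb_root);
	}

So every insertion and deletion on the hot paths pays for maintaining the
cache, while the only consumers here call rb_first() once per whole-tree
iteration.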

								Honza

> ---
> This is part of the rbtree internal caching series:
> https://lwn.net/Articles/726809/
>
>  fs/ext4/dir.c     | 24 ++++++++++++++----------
>  fs/ext4/ext4.h    |  4 ++--
>  fs/ext4/mballoc.c | 22 ++++++++++++----------
>  3 files changed, 28 insertions(+), 22 deletions(-)
>
> diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
> index e8b365000d73..011df6bba13c 100644
> --- a/fs/ext4/dir.c
> +++ b/fs/ext4/dir.c
> @@ -391,18 +391,18 @@ struct fname {
>   * This functoin implements a non-recursive way of freeing all of the
>   * nodes in the red-black tree.
>   */
> -static void free_rb_tree_fname(struct rb_root *root)
> +static void free_rb_tree_fname(struct rb_root_cached *root)
>  {
>  	struct fname *fname, *next;
>
> -	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
> +	rbtree_postorder_for_each_entry_safe(fname, next, &root->rb_root, rb_hash)
>  	while (fname) {
>  		struct fname *old = fname;
>  		fname = fname->next;
>  		kfree(old);
>  	}
>
> -	*root = RB_ROOT;
> +	*root = RB_ROOT_CACHED;
>  }
>
>
> @@ -441,9 +441,10 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
>  	struct fname *fname, *new_fn;
>  	struct dir_private_info *info;
>  	int len;
> +	bool leftmost = true;
>
>  	info = dir_file->private_data;
> -	p = &info->root.rb_node;
> +	p = &info->root.rb_root.rb_node;
>
>  	/* Create and allocate the fname structure */
>  	len = sizeof(struct fname) + ent_name->len + 1;
> @@ -475,16 +476,19 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
>
>  		if (new_fn->hash < fname->hash)
>  			p = &(*p)->rb_left;
> -		else if (new_fn->hash > fname->hash)
> +		else if (new_fn->hash > fname->hash) {
>  			p = &(*p)->rb_right;
> -		else if (new_fn->minor_hash < fname->minor_hash)
> +			leftmost = false;
> +		} else if (new_fn->minor_hash < fname->minor_hash)
>  			p = &(*p)->rb_left;
> -		else /* if (new_fn->minor_hash > fname->minor_hash) */
> +		else { /* if (new_fn->minor_hash > fname->minor_hash) */
>  			p = &(*p)->rb_right;
> +			leftmost = false;
> +		}
>  	}
>
>  	rb_link_node(&new_fn->rb_hash, parent, p);
> -	rb_insert_color(&new_fn->rb_hash, &info->root);
> +	rb_insert_color_cached(&new_fn->rb_hash, &info->root, leftmost);
>  	return 0;
>  }
>
> @@ -558,7 +562,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
>  		info->extra_fname = NULL;
>  		goto next_node;
>  	} else if (!info->curr_node)
> -		info->curr_node = rb_first(&info->root);
> +		info->curr_node = rb_first_cached(&info->root);
>
>  	while (1) {
>  		/*
> @@ -580,7 +584,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
>  			ctx->pos = ext4_get_htree_eof(file);
>  			break;
>  		}
> -		info->curr_node = rb_first(&info->root);
> +		info->curr_node = rb_first_cached(&info->root);
>  	}
>
>  	fname = rb_entry(info->curr_node, struct fname, rb_hash);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 9ebde0cd632e..0242c6801600 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2116,7 +2116,7 @@ static inline bool ext4_is_quota_file(struct inode *inode)
>   * readdir operations in hash tree order.
>   */
>  struct dir_private_info {
> -	struct rb_root	root;
> +	struct rb_root_cached	root;
>  	struct rb_node	*curr_node;
>  	struct fname	*extra_fname;
>  	loff_t		last_pos;
> @@ -2883,7 +2883,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>
>  struct ext4_group_info {
>  	unsigned long bb_state;
> -	struct rb_root bb_free_root;
> +	struct rb_root_cached bb_free_root;
>  	ext4_grpblk_t bb_first_free;	/* first free block */
>  	ext4_grpblk_t bb_free;		/* total free blocks */
>  	ext4_grpblk_t bb_fragments;	/* nr of freespace fragments */
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 581e357e8406..df36f9123a99 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2462,7 +2462,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>
>  	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>  	init_rwsem(&meta_group_info[i]->alloc_sem);
> -	meta_group_info[i]->bb_free_root = RB_ROOT;
> +	meta_group_info[i]->bb_free_root = RB_ROOT_CACHED;
>  	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
>
>  #ifdef DOUBLE_CHECK
> @@ -2824,7 +2824,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
>  		count2++;
>  		ext4_lock_group(sb, entry->efd_group);
>  		/* Take it out of per group rb tree */
> -		rb_erase(&entry->efd_node, &(db->bb_free_root));
> +		rb_erase_cached(&entry->efd_node, &(db->bb_free_root));
>  		mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
>
>  		/*
> @@ -2836,7 +2836,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
>  		if (!test_opt(sb, DISCARD))
>  			EXT4_MB_GRP_CLEAR_TRIMMED(db);
>
> -		if (!db->bb_free_root.rb_node) {
> +		if (!db->bb_free_root.rb_root.rb_node) {
>  			/* No more items in the per group rb tree
>  			 * balance refcounts from ext4_mb_free_metadata()
>  			 */
> @@ -3519,7 +3519,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
>  	struct ext4_free_data *entry;
>
>  	grp = ext4_get_group_info(sb, group);
> -	n = rb_first(&(grp->bb_free_root));
> +	n = rb_first_cached(&(grp->bb_free_root));
>
>  	while (n) {
>  		entry = rb_entry(n, struct ext4_free_data, efd_node);
> @@ -4624,7 +4624,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
>  static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
>  					struct ext4_free_data *entry,
>  					struct ext4_free_data *new_entry,
> -					struct rb_root *entry_rb_root)
> +					struct rb_root_cached *entry_rb_root)
>  {
>  	if ((entry->efd_tid != new_entry->efd_tid) ||
>  	    (entry->efd_group != new_entry->efd_group))
> @@ -4641,7 +4641,7 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
>  	spin_lock(&sbi->s_md_lock);
>  	list_del(&entry->efd_list);
>  	spin_unlock(&sbi->s_md_lock);
> -	rb_erase(&entry->efd_node, entry_rb_root);
> +	rb_erase_cached(&entry->efd_node, entry_rb_root);
>  	kmem_cache_free(ext4_free_data_cachep, entry);
>  }
>
> @@ -4656,8 +4656,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	struct ext4_group_info *db = e4b->bd_info;
>  	struct super_block *sb = e4b->bd_sb;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> -	struct rb_node **n = &db->bb_free_root.rb_node, *node;
> +	struct rb_node **n = &db->bb_free_root.rb_root.rb_node, *node;
>  	struct rb_node *parent = NULL, *new_node;
> +	bool leftmost = true;
>
>  	BUG_ON(!ext4_handle_valid(handle));
>  	BUG_ON(e4b->bd_bitmap_page == NULL);
> @@ -4680,9 +4681,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  		entry = rb_entry(parent, struct ext4_free_data, efd_node);
>  		if (cluster < entry->efd_start_cluster)
>  			n = &(*n)->rb_left;
> -		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
> +		else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) {
>  			n = &(*n)->rb_right;
> -		else {
> +			leftmost = false;
> +		} else {
>  			ext4_grp_locked_error(sb, group, 0,
>  				ext4_group_first_block_no(sb, group) +
>  				EXT4_C2B(sbi, cluster),
> @@ -4692,7 +4694,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	}
>
>  	rb_link_node(new_node, parent, n);
> -	rb_insert_color(new_node, &db->bb_free_root);
> +	rb_insert_color_cached(new_node, &db->bb_free_root, leftmost);
>
>  	/* Now try to see the extent can be merged to left and right */
>  	node = rb_prev(new_node);
> --
> 2.12.0
>
--
Jan Kara
SUSE Labs, CR