All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] f2fs: add extent cache base on rb-tree
@ 2014-12-19 10:49 Chao Yu
  2014-12-22  2:03 ` Changman Lee
  0 siblings, 1 reply; 26+ messages in thread
From: Chao Yu @ 2014-12-19 10:49 UTC (permalink / raw)
  To: Jaegeuk Kim, Changman Lee; +Cc: linux-f2fs-devel, linux-kernel

Now f2fs have page-block mapping cache which can cache only one extent mapping
between contiguous logical address and physical address.
Normally, this design will work well because f2fs will expand coverage area of
the mapping extent when we write forward sequentially. But when we write data
randomly in Out-Place-Update mode, the extent will be shorten and hardly be
expanded for most time as following reasons:
1.The short part of extent will be discarded if we break contiguous mapping in
the middle of extent.
2.The new mapping will be added into mapping cache only at head or tail of the
extent.
3.We will drop the extent cache when the extent became very fragmented.
4.We will not update the extent with mapping which we get from readpages or
readpage.

To solve above problems, this patch adds extent cache base on rb-tree like other
filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
more effective cache between dnode page cache and disk. It will supply high hit
ratio in the cache with fewer memory when dnode page cache are reclaimed in
environment of low memory.

Todo:
*introduce mount option for extent cache.
*add shrink ability for extent cache.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
---
 fs/f2fs/data.c  | 348 +++++++++++++++++++++++++++++++++++++++++---------------
 fs/f2fs/debug.c |   2 +
 fs/f2fs/f2fs.h  |  49 ++++----
 fs/f2fs/inode.c |   5 +-
 fs/f2fs/super.c |  11 +-
 5 files changed, 291 insertions(+), 124 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ec697b..20592e2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -24,6 +24,8 @@
 #include "segment.h"
 #include <trace/events/f2fs.h>
 
+struct kmem_cache *extent_info_cache;
+
 static void f2fs_read_end_io(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
@@ -247,126 +249,264 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
-static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
-					struct buffer_head *bh_result)
+static struct extent_info *__insert_extent_cache(struct inode *inode,
+				unsigned int fofs, unsigned int len, u32 blk)
 {
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	pgoff_t start_fofs, end_fofs;
-	block_t start_blkaddr;
-
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
-		return 0;
-
-	read_lock(&fi->ext.ext_lock);
-	if (fi->ext.len == 0) {
-		read_unlock(&fi->ext.ext_lock);
-		return 0;
+	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
+	struct rb_node *p = root->rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_info *ei;
+
+	while (p) {
+		parent = p;
+		ei = rb_entry(parent, struct extent_info, rb_node);
+
+		if (fofs < ei->fofs)
+			p = p->rb_left;
+		else if (fofs >= ei->fofs + ei->len)
+			p = p->rb_right;
+		else
+			f2fs_bug_on(F2FS_I_SB(inode), 1);
 	}
 
-	stat_inc_total_hit(inode->i_sb);
+	ei = kmem_cache_alloc(extent_info_cache, GFP_ATOMIC);
+	ei->fofs = fofs;
+	ei->blk = blk;
+	ei->len = len;
+
+	rb_link_node(&ei->rb_node, parent, &p);
+	rb_insert_color(&ei->rb_node, root);
+	stat_inc_extent_count(inode->i_sb);
+	return ei;
+}
 
-	start_fofs = fi->ext.fofs;
-	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk_addr;
+static bool __remove_extent_cache(struct inode *inode, unsigned int fofs,
+							struct extent_info *cei)
+{
+	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
+	struct rb_node *p = root->rb_node;
+	struct extent_info *ei;
 
-	if (pgofs >= start_fofs && pgofs <= end_fofs) {
-		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
-		size_t count;
+	while (p) {
+		ei = rb_entry(p, struct extent_info, rb_node);
 
-		clear_buffer_new(bh_result);
-		map_bh(bh_result, inode->i_sb,
-				start_blkaddr + pgofs - start_fofs);
-		count = end_fofs - pgofs + 1;
-		if (count < (UINT_MAX >> blkbits))
-			bh_result->b_size = (count << blkbits);
+		if (fofs < ei->fofs)
+			p = p->rb_left;
+		else if (fofs >= ei->fofs + ei->len)
+			p = p->rb_right;
 		else
-			bh_result->b_size = UINT_MAX;
+			goto found;
+	}
+	return true;
+found:
+	ei = rb_entry(p, struct extent_info, rb_node);
+	cei->fofs = ei->fofs;
+	cei->blk = ei->blk;
+	cei->len = ei->len;
+	rb_erase(&ei->rb_node, root);
+	kmem_cache_free(extent_info_cache, ei);
+	stat_dec_extent_count(inode->i_sb);
+	return false;
+}
 
-		stat_inc_read_hit(inode->i_sb);
-		read_unlock(&fi->ext.ext_lock);
-		return 1;
+static void __try_merge_extent(struct inode *inode, struct extent_info *ei)
+{
+	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
+	struct extent_info *pei = NULL;
+	struct rb_node *node;
+
+	node = rb_prev(&ei->rb_node);
+	if (node) {
+		pei = rb_entry(node, struct extent_info, rb_node);
+		if (ei->blk == pei->blk + pei->len) {
+			ei->fofs = pei->fofs;
+			ei->blk = pei->blk;
+			ei->len += pei->len;
+			rb_erase(&pei->rb_node, root);
+			kmem_cache_free(extent_info_cache, pei);
+			stat_dec_extent_count(inode->i_sb);
+		}
+	}
+
+	node = rb_next(&ei->rb_node);
+	if (node) {
+		pei = rb_entry(node, struct extent_info, rb_node);
+		if (ei->blk + 1 == pei->blk) {
+			ei->len += pei->len;
+			rb_erase(&pei->rb_node, root);
+			kmem_cache_free(extent_info_cache, pei);
+			stat_dec_extent_count(inode->i_sb);
+		}
 	}
-	read_unlock(&fi->ext.ext_lock);
-	return 0;
 }
 
-void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+inline void get_extent_info(struct inode *inode, struct f2fs_extent *i_ext)
 {
-	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
-	pgoff_t fofs, start_fofs, end_fofs;
-	block_t start_blkaddr, end_blkaddr;
-	int need_update = true;
+	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
+	struct extent_info *ei;
+
+	write_lock(&tree->ei_lock);
+	ei = __insert_extent_cache(inode, le32_to_cpu(i_ext->fofs),
+			le32_to_cpu(i_ext->len), le32_to_cpu(i_ext->blk_addr));
+	tree->cached_ei = ei;
+	write_unlock(&tree->ei_lock);
+}
 
-	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
-	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
-							dn->ofs_in_node;
+inline void set_raw_extent(struct f2fs_ei_tree *tree,
+					struct f2fs_extent *i_ext)
+{
+	struct extent_info *ei = tree->cached_ei;
 
-	/* Update the page address in the parent node */
-	__set_data_blkaddr(dn, blk_addr);
+	read_lock(&tree->ei_lock);
+	if (ei) {
+		i_ext->fofs = cpu_to_le32(ei->fofs);
+		i_ext->blk_addr = cpu_to_le32(ei->blk);
+		i_ext->len = cpu_to_le32(ei->len);
+	}
+	read_unlock(&tree->ei_lock);
+}
 
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
-		return;
+bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
+	struct rb_node *node;
+	struct extent_info *pei;
 
-	write_lock(&fi->ext.ext_lock);
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		return false;
 
-	start_fofs = fi->ext.fofs;
-	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk_addr;
-	end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+	read_lock(&tree->ei_lock);
 
-	/* Drop and initialize the matched extent */
-	if (fi->ext.len == 1 && fofs == start_fofs)
-		fi->ext.len = 0;
+	stat_inc_total_hit(inode->i_sb);
 
-	/* Initial extent */
-	if (fi->ext.len == 0) {
-		if (blk_addr != NULL_ADDR) {
-			fi->ext.fofs = fofs;
-			fi->ext.blk_addr = blk_addr;
-			fi->ext.len = 1;
-		}
-		goto end_update;
+	/* #1: find recently accessed extent info firstly */
+	if (tree->cached_ei) {
+		pei = tree->cached_ei;
+		if (pgofs >= pei->fofs && pgofs < pei->fofs + pei->len)
+			goto found;
 	}
 
-	/* Front merge */
-	if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
-		fi->ext.fofs--;
-		fi->ext.blk_addr--;
-		fi->ext.len++;
-		goto end_update;
+	/* #2: find in rb tree */
+	node = tree->root.rb_node;
+	while (node) {
+		pei = rb_entry(node, struct extent_info, rb_node);
+		if (pgofs < pei->fofs)
+			node = node->rb_left;
+		else if (pgofs >= pei->fofs + pei->len)
+			node = node->rb_right;
+		else
+			goto found;
 	}
 
-	/* Back merge */
-	if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
-		fi->ext.len++;
-		goto end_update;
-	}
+	read_unlock(&tree->ei_lock);
+	return false;
+found:
+	ei->fofs = pei->fofs;
+	ei->blk = pei->blk;
+	ei->len = pei->len;
+	stat_inc_read_hit(inode->i_sb);
+	read_unlock(&tree->ei_lock);
+	return true;
+}
 
-	/* Split the existing extent */
-	if (fi->ext.len > 1 &&
-		fofs >= start_fofs && fofs <= end_fofs) {
-		if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
-			fi->ext.len = fofs - start_fofs;
-		} else {
-			fi->ext.fofs = fofs + 1;
-			fi->ext.blk_addr = start_blkaddr +
-					fofs - start_fofs + 1;
-			fi->ext.len -= fofs - start_fofs + 1;
-		}
-	} else {
-		need_update = false;
+void f2fs_update_extent_cache(struct inode *inode, pgoff_t fofs,
+							block_t blk_addr)
+{
+	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
+	struct extent_info ei;
+	struct extent_info *pei = NULL;
+	unsigned int endofs;
+
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		return;
+
+	if (blk_addr == NEW_ADDR)
+		return;
+
+	write_lock(&tree->ei_lock);
+
+	/* update old extent mapping */
+	if (__remove_extent_cache(inode, fofs, &ei))
+		goto add_extent;
+	if (ei.len == 1)
+		goto add_extent;
+
+	if (ei.fofs < fofs)
+		__insert_extent_cache(inode, ei.fofs, fofs - ei.fofs, ei.blk);
+
+	endofs = ei.fofs + ei.len - 1;
+	if (endofs > fofs)
+		__insert_extent_cache(inode, fofs + 1, endofs - fofs,
+						fofs - ei.fofs + ei.blk);
+
+add_extent:
+	/* insert new mapping extent to rb tree */
+	if (blk_addr) {
+		pei = __insert_extent_cache(inode, fofs, 1, blk_addr);
+		__try_merge_extent(inode, pei);
 	}
 
-	/* Finally, if the extent is very fragmented, let's drop the cache. */
-	if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
-		fi->ext.len = 0;
-		set_inode_flag(fi, FI_NO_EXTENT);
-		need_update = true;
+	if (pei)
+		tree->cached_ei = pei;
+
+	write_unlock(&tree->ei_lock);
+}
+
+void f2fs_free_extent_cache(struct inode *inode)
+{
+	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
+	struct rb_node *node, *next;
+	struct extent_info *ei;
+
+	write_lock(&tree->ei_lock);
+	node = rb_first(&tree->root);
+	while (node) {
+		next = rb_next(node);
+		ei = rb_entry(node, struct extent_info, rb_node);
+		rb_erase(node, &tree->root);
+		kmem_cache_free(extent_info_cache, ei);
+		stat_dec_extent_count(inode->i_sb);
+		node = next;
 	}
-end_update:
-	write_unlock(&fi->ext.ext_lock);
-	if (need_update)
-		sync_inode_page(dn);
+	write_unlock(&tree->ei_lock);
+}
+
+static void f2fs_map_bh(struct inode *inode, pgoff_t pgofs,
+			struct extent_info *ei, struct buffer_head *bh_result)
+{
+	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+	pgoff_t start_fofs, end_fofs;
+	block_t start_blkaddr;
+	size_t count;
+
+	start_fofs = ei->fofs;
+	end_fofs = ei->fofs + ei->len - 1;
+	start_blkaddr = ei->blk;
+
+	clear_buffer_new(bh_result);
+	map_bh(bh_result, inode->i_sb,
+			start_blkaddr + pgofs - start_fofs);
+	count = end_fofs - pgofs + 1;
+	if (count < (UINT_MAX >> blkbits))
+		bh_result->b_size = (count << blkbits);
+	else
+		bh_result->b_size = UINT_MAX;
+}
+
+void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+{
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	pgoff_t fofs;
+
+	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
+	/* Update the page address in the parent node */
+	__set_data_blkaddr(dn, blk_addr);
+
+	f2fs_update_extent_cache(dn->inode, fofs, blk_addr);
 	return;
 }
 
@@ -603,12 +743,15 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	pgoff_t pgofs, end_offset;
 	int err = 0, ofs = 1;
 	bool allocated = false;
+	struct extent_info ei;
 
 	/* Get the page offset from the block offset(iblock) */
 	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
 
-	if (check_extent_cache(inode, pgofs, bh_result))
+	if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+		f2fs_map_bh(inode, pgofs, &ei, bh_result);
 		goto out;
+	}
 
 	if (create) {
 		f2fs_balance_fs(F2FS_I_SB(inode));
@@ -628,6 +771,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 
 	if (dn.data_blkaddr != NULL_ADDR) {
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+		f2fs_update_extent_cache(inode, pgofs, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
 		if (err)
@@ -672,6 +816,8 @@ get_next:
 			allocated = true;
 			blkaddr = dn.data_blkaddr;
 		}
+		if (!allocated)
+			f2fs_update_extent_cache(inode, pgofs, blkaddr);
 		/* Give more consecutive addresses for the readahead */
 		if (blkaddr == (bh_result->b_blocknr + ofs)) {
 			ofs++;
@@ -1160,6 +1306,20 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, get_data_block);
 }
 
+int __init create_extent_cache(void)
+{
+	extent_info_cache = f2fs_kmem_cache_create("f2fs_extent_info_cache",
+			sizeof(struct extent_info));
+	if (!extent_info_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void destroy_extent_cache(void)
+{
+	kmem_cache_destroy(extent_info_cache);
+}
+
 const struct address_space_operations f2fs_dblock_aops = {
 	.readpage	= f2fs_read_data_page,
 	.readpages	= f2fs_read_data_pages,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91e8f69..115cec0 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -35,6 +35,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	/* validation check of the segment numbers */
 	si->hit_ext = sbi->read_hit_ext;
 	si->total_ext = sbi->total_hit_ext;
+	si->ext_count = sbi->total_ext_count;
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
 	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
 	si->ndirty_dirs = sbi->n_dirty_dirs;
@@ -249,6 +250,7 @@ static int stat_show(struct seq_file *s, void *v)
 		seq_printf(s, "  - node blocks : %d\n", si->node_blks);
 		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			   si->hit_ext, si->total_ext);
+		seq_printf(s, "\nExtent Node Count: %d\n", si->ext_count);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
 		seq_printf(s, "  - inmem: %4d\n",
 			   si->inmem_pages);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ec58bb2..dab6fdb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -263,10 +263,16 @@ enum {
 #define F2FS_MIN_EXTENT_LEN	16	/* minimum extent length */
 
 struct extent_info {
-	rwlock_t ext_lock;	/* rwlock for consistency */
-	unsigned int fofs;	/* start offset in a file */
-	u32 blk_addr;		/* start block address of the extent */
-	unsigned int len;	/* length of the extent */
+	struct rb_node rb_node;		/* rb node located in rb-tree */
+	unsigned int fofs;		/* start offset in a file */
+	u32 blk;			/* start block address of the extent */
+	unsigned int len;		/* length of the extent */
+};
+
+struct f2fs_ei_tree {
+	struct rb_root root;		/* root of extent info rb-tree */
+	struct extent_info *cached_ei;	/* recently accessed extent */
+	rwlock_t ei_lock;		/* rwlock for consistency */
 };
 
 /*
@@ -294,7 +300,8 @@ struct f2fs_inode_info {
 	unsigned int clevel;		/* maximum level of given file name */
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
-	struct extent_info ext;		/* in-memory extent cache entry */
+	struct extent_info ext;
+	struct f2fs_ei_tree ei_tree;	/* in-memory extent cache entry */
 	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
@@ -302,26 +309,6 @@ struct f2fs_inode_info {
 	struct mutex inmem_lock;	/* lock for inmemory pages */
 };
 
-static inline void get_extent_info(struct extent_info *ext,
-					struct f2fs_extent i_ext)
-{
-	write_lock(&ext->ext_lock);
-	ext->fofs = le32_to_cpu(i_ext.fofs);
-	ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
-	ext->len = le32_to_cpu(i_ext.len);
-	write_unlock(&ext->ext_lock);
-}
-
-static inline void set_raw_extent(struct extent_info *ext,
-					struct f2fs_extent *i_ext)
-{
-	read_lock(&ext->ext_lock);
-	i_ext->fofs = cpu_to_le32(ext->fofs);
-	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
-	i_ext->len = cpu_to_le32(ext->len);
-	read_unlock(&ext->ext_lock);
-}
-
 struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
@@ -590,6 +577,7 @@ struct f2fs_sb_info {
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
+	int total_ext_count;                    /* extent cache node count */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
 	int bg_gc;				/* background gc calls */
@@ -1462,12 +1450,17 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
 						struct f2fs_io_info *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
+void set_raw_extent(struct f2fs_ei_tree *, struct f2fs_extent *);
+void get_extent_info(struct inode *, struct f2fs_extent *);
+void f2fs_free_extent_cache(struct inode *);
 void update_extent_cache(block_t, struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
 
 /*
  * gc.c
@@ -1495,7 +1488,7 @@ struct f2fs_stat_info {
 	struct f2fs_sb_info *sbi;
 	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
 	int main_area_segs, main_area_sections, main_area_zones;
-	int hit_ext, total_ext;
+	int hit_ext, total_ext, ext_count;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, sits, fnids;
 	int total_count, utilization;
@@ -1529,6 +1522,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
 #define stat_inc_total_hit(sb)		((F2FS_SB(sb))->total_hit_ext++)
 #define stat_inc_read_hit(sb)		((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_extent_count(sb)	((F2FS_SB(sb))->total_ext_count++)
+#define stat_dec_extent_count(sb)	((F2FS_SB(sb))->total_ext_count--)
 #define stat_inc_inline_inode(inode)					\
 	do {								\
 		if (f2fs_has_inline_data(inode))			\
@@ -1593,6 +1588,8 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_dirty_dir(sbi)
 #define stat_inc_total_hit(sb)
 #define stat_inc_read_hit(sb)
+#define stat_inc_extent_count(sb)
+#define stat_dec_extent_count(sb)
 #define stat_inc_inline_inode(inode)
 #define stat_dec_inline_inode(inode)
 #define stat_inc_inline_dir(inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 196cc78..ae0dd9b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -137,7 +137,7 @@ static int do_read_inode(struct inode *inode)
 	fi->i_pino = le32_to_cpu(ri->i_pino);
 	fi->i_dir_level = ri->i_dir_level;
 
-	get_extent_info(&fi->ext, ri->i_ext);
+	get_extent_info(inode, &ri->i_ext);
 	get_inline_info(fi, ri);
 
 	/* check data exist */
@@ -227,7 +227,7 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_links = cpu_to_le32(inode->i_nlink);
 	ri->i_size = cpu_to_le64(i_size_read(inode));
 	ri->i_blocks = cpu_to_le64(inode->i_blocks);
-	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+	set_raw_extent(&F2FS_I(inode)->ei_tree, &ri->i_ext);
 	set_raw_inline(F2FS_I(inode), ri);
 
 	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -335,6 +335,7 @@ void f2fs_evict_inode(struct inode *inode)
 no_delete:
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
+	f2fs_free_extent_cache(inode);
 	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
 	if (xnid)
 		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f71421d..f205493 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -381,7 +381,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	atomic_set(&fi->dirty_pages, 0);
 	fi->i_current_depth = 1;
 	fi->i_advise = 0;
-	rwlock_init(&fi->ext.ext_lock);
+	fi->ei_tree.root = RB_ROOT;
+	fi->ei_tree.cached_ei = NULL;
+	rwlock_init(&fi->ei_tree.ei_lock);
 	init_rwsem(&fi->i_sem);
 	INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
 	INIT_LIST_HEAD(&fi->inmem_pages);
@@ -1235,10 +1237,13 @@ static int __init init_f2fs_fs(void)
 	err = create_checkpoint_caches();
 	if (err)
 		goto free_gc_caches;
+	err = create_extent_cache();
+	if (err)
+		goto free_checkpoint_caches;
 	f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
 	if (!f2fs_kset) {
 		err = -ENOMEM;
-		goto free_checkpoint_caches;
+		goto free_extent_cache;
 	}
 	err = register_filesystem(&f2fs_fs_type);
 	if (err)
@@ -1249,6 +1254,8 @@ static int __init init_f2fs_fs(void)
 
 free_kset:
 	kset_unregister(f2fs_kset);
+free_extent_cache:
+	destroy_extent_cache();
 free_checkpoint_caches:
 	destroy_checkpoint_caches();
 free_gc_caches:
-- 
2.1.2



^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-19 10:49 [RFC PATCH] f2fs: add extent cache base on rb-tree Chao Yu
@ 2014-12-22  2:03 ` Changman Lee
  2014-12-22  7:10   ` Chao Yu
  2014-12-22  9:06   ` Chao Yu
  0 siblings, 2 replies; 26+ messages in thread
From: Changman Lee @ 2014-12-22  2:03 UTC (permalink / raw)
  To: Chao Yu; +Cc: Jaegeuk Kim, linux-f2fs-devel, linux-kernel

Hi Yu,

Good approach.
As you know, however, f2fs breaks extent itself due to COW.
Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
So we would have lots of extents per inode and it could lead to overhead
to manage extents.

Anyway, mount option could be alternative for this patch.

On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> Now f2fs have page-block mapping cache which can cache only one extent mapping
> between contiguous logical address and physical address.
> Normally, this design will work well because f2fs will expand coverage area of
> the mapping extent when we write forward sequentially. But when we write data
> randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> expanded for most time as following reasons:
> 1.The short part of extent will be discarded if we break contiguous mapping in
> the middle of extent.
> 2.The new mapping will be added into mapping cache only at head or tail of the
> extent.
> 3.We will drop the extent cache when the extent became very fragmented.
> 4.We will not update the extent with mapping which we get from readpages or
> readpage.
> 
> To solve above problems, this patch adds extent cache base on rb-tree like other
> filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> more effective cache between dnode page cache and disk. It will supply high hit
> ratio in the cache with fewer memory when dnode page cache are reclaimed in
> environment of low memory.
> 
> Todo:
> *introduce mount option for extent cache.
> *add shrink ability for extent cache.
> 
> Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> ---
>  fs/f2fs/data.c  | 348 +++++++++++++++++++++++++++++++++++++++++---------------
>  fs/f2fs/debug.c |   2 +
>  fs/f2fs/f2fs.h  |  49 ++++----
>  fs/f2fs/inode.c |   5 +-
>  fs/f2fs/super.c |  11 +-
>  5 files changed, 291 insertions(+), 124 deletions(-)
> 
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 7ec697b..20592e2 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -24,6 +24,8 @@
>  #include "segment.h"
>  #include <trace/events/f2fs.h>
>  
> +struct kmem_cache *extent_info_cache;
> +
>  static void f2fs_read_end_io(struct bio *bio, int err)
>  {
>  	struct bio_vec *bvec;
> @@ -247,126 +249,264 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
>  	return err;
>  }
>  
> -static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
> -					struct buffer_head *bh_result)
> +static struct extent_info *__insert_extent_cache(struct inode *inode,
> +				unsigned int fofs, unsigned int len, u32 blk)
>  {
> -	struct f2fs_inode_info *fi = F2FS_I(inode);
> -	pgoff_t start_fofs, end_fofs;
> -	block_t start_blkaddr;
> -
> -	if (is_inode_flag_set(fi, FI_NO_EXTENT))
> -		return 0;
> -
> -	read_lock(&fi->ext.ext_lock);
> -	if (fi->ext.len == 0) {
> -		read_unlock(&fi->ext.ext_lock);
> -		return 0;
> +	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
> +	struct rb_node *p = root->rb_node;
> +	struct rb_node *parent = NULL;
> +	struct extent_info *ei;
> +
> +	while (p) {
> +		parent = p;
> +		ei = rb_entry(parent, struct extent_info, rb_node);
> +
> +		if (fofs < ei->fofs)
> +			p = p->rb_left;
> +		else if (fofs >= ei->fofs + ei->len)
> +			p = p->rb_right;
> +		else
> +			f2fs_bug_on(F2FS_I_SB(inode), 1);
>  	}
>  
> -	stat_inc_total_hit(inode->i_sb);
> +	ei = kmem_cache_alloc(extent_info_cache, GFP_ATOMIC);
> +	ei->fofs = fofs;
> +	ei->blk = blk;
> +	ei->len = len;
> +
> +	rb_link_node(&ei->rb_node, parent, &p);
> +	rb_insert_color(&ei->rb_node, root);
> +	stat_inc_extent_count(inode->i_sb);
> +	return ei;
> +}
>  
> -	start_fofs = fi->ext.fofs;
> -	end_fofs = fi->ext.fofs + fi->ext.len - 1;
> -	start_blkaddr = fi->ext.blk_addr;
> +static bool __remove_extent_cache(struct inode *inode, unsigned int fofs,
> +							struct extent_info *cei)
> +{
> +	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
> +	struct rb_node *p = root->rb_node;
> +	struct extent_info *ei;
>  
> -	if (pgofs >= start_fofs && pgofs <= end_fofs) {
> -		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
> -		size_t count;
> +	while (p) {
> +		ei = rb_entry(p, struct extent_info, rb_node);
>  
> -		clear_buffer_new(bh_result);
> -		map_bh(bh_result, inode->i_sb,
> -				start_blkaddr + pgofs - start_fofs);
> -		count = end_fofs - pgofs + 1;
> -		if (count < (UINT_MAX >> blkbits))
> -			bh_result->b_size = (count << blkbits);
> +		if (fofs < ei->fofs)
> +			p = p->rb_left;
> +		else if (fofs >= ei->fofs + ei->len)
> +			p = p->rb_right;
>  		else
> -			bh_result->b_size = UINT_MAX;
> +			goto found;
> +	}
> +	return true;
> +found:
> +	ei = rb_entry(p, struct extent_info, rb_node);
> +	cei->fofs = ei->fofs;
> +	cei->blk = ei->blk;
> +	cei->len = ei->len;
> +	rb_erase(&ei->rb_node, root);
> +	kmem_cache_free(extent_info_cache, ei);
> +	stat_dec_extent_count(inode->i_sb);
> +	return false;
> +}
>  
> -		stat_inc_read_hit(inode->i_sb);
> -		read_unlock(&fi->ext.ext_lock);
> -		return 1;
> +static void __try_merge_extent(struct inode *inode, struct extent_info *ei)
> +{
> +	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
> +	struct extent_info *pei = NULL;
> +	struct rb_node *node;
> +
> +	node = rb_prev(&ei->rb_node);
> +	if (node) {
> +		pei = rb_entry(node, struct extent_info, rb_node);
> +		if (ei->blk == pei->blk + pei->len) {

Shouldn't we check below together, too?
if (ei->fofs == pei->fofs + pei->len)

> +			ei->fofs = pei->fofs;
> +			ei->blk = pei->blk;
> +			ei->len += pei->len;
> +			rb_erase(&pei->rb_node, root);
> +			kmem_cache_free(extent_info_cache, pei);
> +			stat_dec_extent_count(inode->i_sb);
> +		}
> +	}
> +
> +	node = rb_next(&ei->rb_node);
> +	if (node) {
> +		pei = rb_entry(node, struct extent_info, rb_node);
> +		if (ei->blk + 1 == pei->blk) {
> +			ei->len += pei->len;
> +			rb_erase(&pei->rb_node, root);
> +			kmem_cache_free(extent_info_cache, pei);
> +			stat_dec_extent_count(inode->i_sb);
> +		}
>  	}
> -	read_unlock(&fi->ext.ext_lock);
> -	return 0;
>  }
>  
> -void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
> +inline void get_extent_info(struct inode *inode, struct f2fs_extent *i_ext)
>  {
> -	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
> -	pgoff_t fofs, start_fofs, end_fofs;
> -	block_t start_blkaddr, end_blkaddr;
> -	int need_update = true;
> +	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
> +	struct extent_info *ei;
> +
> +	write_lock(&tree->ei_lock);
> +	ei = __insert_extent_cache(inode, le32_to_cpu(i_ext->fofs),
> +			le32_to_cpu(i_ext->len), le32_to_cpu(i_ext->blk_addr));
> +	tree->cached_ei = ei;
> +	write_unlock(&tree->ei_lock);
> +}
>  
> -	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
> -	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
> -							dn->ofs_in_node;
> +inline void set_raw_extent(struct f2fs_ei_tree *tree,
> +					struct f2fs_extent *i_ext)
> +{
> +	struct extent_info *ei = tree->cached_ei;
>  
> -	/* Update the page address in the parent node */
> -	__set_data_blkaddr(dn, blk_addr);
> +	read_lock(&tree->ei_lock);
> +	if (ei) {
> +		i_ext->fofs = cpu_to_le32(ei->fofs);
> +		i_ext->blk_addr = cpu_to_le32(ei->blk);
> +		i_ext->len = cpu_to_le32(ei->len);
> +	}
> +	read_unlock(&tree->ei_lock);
> +}
>  
> -	if (is_inode_flag_set(fi, FI_NO_EXTENT))
> -		return;
> +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
> +							struct extent_info *ei)
> +{
> +	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
> +	struct rb_node *node;
> +	struct extent_info *pei;
>  
> -	write_lock(&fi->ext.ext_lock);
> +	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
> +		return false;
>  
> -	start_fofs = fi->ext.fofs;
> -	end_fofs = fi->ext.fofs + fi->ext.len - 1;
> -	start_blkaddr = fi->ext.blk_addr;
> -	end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
> +	read_lock(&tree->ei_lock);
>  
> -	/* Drop and initialize the matched extent */
> -	if (fi->ext.len == 1 && fofs == start_fofs)
> -		fi->ext.len = 0;
> +	stat_inc_total_hit(inode->i_sb);
>  
> -	/* Initial extent */
> -	if (fi->ext.len == 0) {
> -		if (blk_addr != NULL_ADDR) {
> -			fi->ext.fofs = fofs;
> -			fi->ext.blk_addr = blk_addr;
> -			fi->ext.len = 1;
> -		}
> -		goto end_update;
> +	/* #1: find recently accessed extent info firstly */
> +	if (tree->cached_ei) {
> +		pei = tree->cached_ei;
> +		if (pgofs >= pei->fofs && pgofs < pei->fofs + pei->len)
> +			goto found;
>  	}
>  
> -	/* Front merge */
> -	if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
> -		fi->ext.fofs--;
> -		fi->ext.blk_addr--;
> -		fi->ext.len++;
> -		goto end_update;
> +	/* #2: find in rb tree */
> +	node = tree->root.rb_node;
> +	while (node) {
> +		pei = rb_entry(node, struct extent_info, rb_node);
> +		if (pgofs < pei->fofs)
> +			node = node->rb_left;
> +		else if (pgofs >= pei->fofs + pei->len)
> +			node = node->rb_right;
> +		else
> +			goto found;
>  	}
>  
> -	/* Back merge */
> -	if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
> -		fi->ext.len++;
> -		goto end_update;
> -	}
> +	read_unlock(&tree->ei_lock);
> +	return false;
> +found:
> +	ei->fofs = pei->fofs;
> +	ei->blk = pei->blk;
> +	ei->len = pei->len;
> +	stat_inc_read_hit(inode->i_sb);
> +	read_unlock(&tree->ei_lock);
> +	return true;
> +}
>  
> -	/* Split the existing extent */
> -	if (fi->ext.len > 1 &&
> -		fofs >= start_fofs && fofs <= end_fofs) {
> -		if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
> -			fi->ext.len = fofs - start_fofs;
> -		} else {
> -			fi->ext.fofs = fofs + 1;
> -			fi->ext.blk_addr = start_blkaddr +
> -					fofs - start_fofs + 1;
> -			fi->ext.len -= fofs - start_fofs + 1;
> -		}
> -	} else {
> -		need_update = false;
> +void f2fs_update_extent_cache(struct inode *inode, pgoff_t fofs,
> +							block_t blk_addr)
> +{
> +	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
> +	struct extent_info ei;
> +	struct extent_info *pei = NULL;
> +	unsigned int endofs;
> +
> +	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
> +		return;
> +
> +	if (blk_addr == NEW_ADDR)
> +		return;
> +
> +	write_lock(&tree->ei_lock);
> +
> +	/* update old extent mapping */
> +	if (__remove_extent_cache(inode, fofs, &ei))
> +		goto add_extent;
> +	if (ei.len == 1)
> +		goto add_extent;
> +
> +	if (ei.fofs < fofs)
> +		__insert_extent_cache(inode, ei.fofs, fofs - ei.fofs, ei.blk);
> +
> +	endofs = ei.fofs + ei.len - 1;
> +	if (endofs > fofs)
> +		__insert_extent_cache(inode, fofs + 1, endofs - fofs,
> +						fofs - ei.fofs + ei.blk);
> +
> +add_extent:
> +	/* insert new mapping extent to rb tree */
> +	if (blk_addr) {
> +		pei = __insert_extent_cache(inode, fofs, 1, blk_addr);
> +		__try_merge_extent(inode, pei);
>  	}
>  
> -	/* Finally, if the extent is very fragmented, let's drop the cache. */
> -	if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
> -		fi->ext.len = 0;
> -		set_inode_flag(fi, FI_NO_EXTENT);
> -		need_update = true;
> +	if (pei)
> +		tree->cached_ei = pei;
> +
> +	write_unlock(&tree->ei_lock);
> +}
> +
> +void f2fs_free_extent_cache(struct inode *inode)
> +{
> +	struct f2fs_ei_tree *tree = &F2FS_I(inode)->ei_tree;
> +	struct rb_node *node, *next;
> +	struct extent_info *ei;
> +
> +	write_lock(&tree->ei_lock);
> +	node = rb_first(&tree->root);
> +	while (node) {
> +		next = rb_next(node);
> +		ei = rb_entry(node, struct extent_info, rb_node);
> +		rb_erase(node, &tree->root);
> +		kmem_cache_free(extent_info_cache, ei);
> +		stat_dec_extent_count(inode->i_sb);
> +		node = next;
>  	}
> -end_update:
> -	write_unlock(&fi->ext.ext_lock);
> -	if (need_update)
> -		sync_inode_page(dn);
> +	write_unlock(&tree->ei_lock);
> +}
> +
> +static void f2fs_map_bh(struct inode *inode, pgoff_t pgofs,
> +			struct extent_info *ei, struct buffer_head *bh_result)
> +{
> +	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
> +	pgoff_t start_fofs, end_fofs;
> +	block_t start_blkaddr;
> +	size_t count;
> +
> +	start_fofs = ei->fofs;
> +	end_fofs = ei->fofs + ei->len - 1;
> +	start_blkaddr = ei->blk;
> +
> +	clear_buffer_new(bh_result);
> +	map_bh(bh_result, inode->i_sb,
> +			start_blkaddr + pgofs - start_fofs);
> +	count = end_fofs - pgofs + 1;
> +	if (count < (UINT_MAX >> blkbits))
> +		bh_result->b_size = (count << blkbits);
> +	else
> +		bh_result->b_size = UINT_MAX;
> +}
> +
> +void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
> +{
> +	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
> +	pgoff_t fofs;
> +
> +	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
> +	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
> +							dn->ofs_in_node;
> +
> +	/* Update the page address in the parent node */
> +	__set_data_blkaddr(dn, blk_addr);
> +
> +	f2fs_update_extent_cache(dn->inode, fofs, blk_addr);
>  	return;
>  }
>  
> @@ -603,12 +743,15 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
>  	pgoff_t pgofs, end_offset;
>  	int err = 0, ofs = 1;
>  	bool allocated = false;
> +	struct extent_info ei;
>  
>  	/* Get the page offset from the block offset(iblock) */
>  	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
>  
> -	if (check_extent_cache(inode, pgofs, bh_result))
> +	if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
> +		f2fs_map_bh(inode, pgofs, &ei, bh_result);
>  		goto out;
> +	}
>  
>  	if (create) {
>  		f2fs_balance_fs(F2FS_I_SB(inode));
> @@ -628,6 +771,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
>  
>  	if (dn.data_blkaddr != NULL_ADDR) {
>  		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
> +		f2fs_update_extent_cache(inode, pgofs, dn.data_blkaddr);
>  	} else if (create) {
>  		err = __allocate_data_block(&dn);
>  		if (err)
> @@ -672,6 +816,8 @@ get_next:
>  			allocated = true;
>  			blkaddr = dn.data_blkaddr;
>  		}
> +		if (!allocated)
> +			f2fs_update_extent_cache(inode, pgofs, blkaddr);
>  		/* Give more consecutive addresses for the readahead */
>  		if (blkaddr == (bh_result->b_blocknr + ofs)) {
>  			ofs++;
> @@ -1160,6 +1306,20 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
>  	return generic_block_bmap(mapping, block, get_data_block);
>  }
>  
> +int __init create_extent_cache(void)
> +{
> +	extent_info_cache = f2fs_kmem_cache_create("f2fs_extent_info_cache",
> +			sizeof(struct extent_info));
> +	if (!extent_info_cache)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
> +void destroy_extent_cache(void)
> +{
> +	kmem_cache_destroy(extent_info_cache);
> +}
> +
>  const struct address_space_operations f2fs_dblock_aops = {
>  	.readpage	= f2fs_read_data_page,
>  	.readpages	= f2fs_read_data_pages,
> diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
> index 91e8f69..115cec0 100644
> --- a/fs/f2fs/debug.c
> +++ b/fs/f2fs/debug.c
> @@ -35,6 +35,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
>  	/* validation check of the segment numbers */
>  	si->hit_ext = sbi->read_hit_ext;
>  	si->total_ext = sbi->total_hit_ext;
> +	si->ext_count = sbi->total_ext_count;
>  	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
>  	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
>  	si->ndirty_dirs = sbi->n_dirty_dirs;
> @@ -249,6 +250,7 @@ static int stat_show(struct seq_file *s, void *v)
>  		seq_printf(s, "  - node blocks : %d\n", si->node_blks);
>  		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
>  			   si->hit_ext, si->total_ext);
> +		seq_printf(s, "\nExtent Node Count: %d\n", si->ext_count);
>  		seq_puts(s, "\nBalancing F2FS Async:\n");
>  		seq_printf(s, "  - inmem: %4d\n",
>  			   si->inmem_pages);
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index ec58bb2..dab6fdb 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -263,10 +263,16 @@ enum {
>  #define F2FS_MIN_EXTENT_LEN	16	/* minimum extent length */
>  
>  struct extent_info {
> -	rwlock_t ext_lock;	/* rwlock for consistency */
> -	unsigned int fofs;	/* start offset in a file */
> -	u32 blk_addr;		/* start block address of the extent */
> -	unsigned int len;	/* length of the extent */
> +	struct rb_node rb_node;		/* rb node located in rb-tree */
> +	unsigned int fofs;		/* start offset in a file */
> +	u32 blk;			/* start block address of the extent */
> +	unsigned int len;		/* length of the extent */
> +};
> +
> +struct f2fs_ei_tree {
> +	struct rb_root root;		/* root of extent info rb-tree */
> +	struct extent_info *cached_ei;	/* recently accessed extent */
> +	rwlock_t ei_lock;		/* rwlock for consistency */
>  };
>  
>  /*
> @@ -294,7 +300,8 @@ struct f2fs_inode_info {
>  	unsigned int clevel;		/* maximum level of given file name */
>  	nid_t i_xattr_nid;		/* node id that contains xattrs */
>  	unsigned long long xattr_ver;	/* cp version of xattr modification */
> -	struct extent_info ext;		/* in-memory extent cache entry */
> +	struct extent_info ext;
> +	struct f2fs_ei_tree ei_tree;	/* in-memory extent cache entry */
>  	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
>  
>  	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
> @@ -302,26 +309,6 @@ struct f2fs_inode_info {
>  	struct mutex inmem_lock;	/* lock for inmemory pages */
>  };
>  
> -static inline void get_extent_info(struct extent_info *ext,
> -					struct f2fs_extent i_ext)
> -{
> -	write_lock(&ext->ext_lock);
> -	ext->fofs = le32_to_cpu(i_ext.fofs);
> -	ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
> -	ext->len = le32_to_cpu(i_ext.len);
> -	write_unlock(&ext->ext_lock);
> -}
> -
> -static inline void set_raw_extent(struct extent_info *ext,
> -					struct f2fs_extent *i_ext)
> -{
> -	read_lock(&ext->ext_lock);
> -	i_ext->fofs = cpu_to_le32(ext->fofs);
> -	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
> -	i_ext->len = cpu_to_le32(ext->len);
> -	read_unlock(&ext->ext_lock);
> -}
> -
>  struct f2fs_nm_info {
>  	block_t nat_blkaddr;		/* base disk address of NAT */
>  	nid_t max_nid;			/* maximum possible node ids */
> @@ -590,6 +577,7 @@ struct f2fs_sb_info {
>  	unsigned int segment_count[2];		/* # of allocated segments */
>  	unsigned int block_count[2];		/* # of allocated blocks */
>  	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
> +	int total_ext_count;                    /* extent cache node count */
>  	atomic_t inline_inode;			/* # of inline_data inodes */
>  	atomic_t inline_dir;			/* # of inline_dentry inodes */
>  	int bg_gc;				/* background gc calls */
> @@ -1462,12 +1450,17 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
>  						struct f2fs_io_info *);
>  int reserve_new_block(struct dnode_of_data *);
>  int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
> +void set_raw_extent(struct f2fs_ei_tree *, struct f2fs_extent *);
> +void get_extent_info(struct inode *, struct f2fs_extent *);
> +void f2fs_free_extent_cache(struct inode *);
>  void update_extent_cache(block_t, struct dnode_of_data *);
>  struct page *find_data_page(struct inode *, pgoff_t, bool);
>  struct page *get_lock_data_page(struct inode *, pgoff_t);
>  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
>  int do_write_data_page(struct page *, struct f2fs_io_info *);
>  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
> +int __init create_extent_cache(void);
> +void destroy_extent_cache(void);
>  
>  /*
>   * gc.c
> @@ -1495,7 +1488,7 @@ struct f2fs_stat_info {
>  	struct f2fs_sb_info *sbi;
>  	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
>  	int main_area_segs, main_area_sections, main_area_zones;
> -	int hit_ext, total_ext;
> +	int hit_ext, total_ext, ext_count;
>  	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
>  	int nats, sits, fnids;
>  	int total_count, utilization;
> @@ -1529,6 +1522,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
>  #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
>  #define stat_inc_total_hit(sb)		((F2FS_SB(sb))->total_hit_ext++)
>  #define stat_inc_read_hit(sb)		((F2FS_SB(sb))->read_hit_ext++)
> +#define stat_inc_extent_count(sb)	((F2FS_SB(sb))->total_ext_count++)
> +#define stat_dec_extent_count(sb)	((F2FS_SB(sb))->total_ext_count--)
>  #define stat_inc_inline_inode(inode)					\
>  	do {								\
>  		if (f2fs_has_inline_data(inode))			\
> @@ -1593,6 +1588,8 @@ void f2fs_destroy_root_stats(void);
>  #define stat_dec_dirty_dir(sbi)
>  #define stat_inc_total_hit(sb)
>  #define stat_inc_read_hit(sb)
> +#define stat_inc_extent_count(sb)
> +#define stat_dec_extent_count(sb)
>  #define stat_inc_inline_inode(inode)
>  #define stat_dec_inline_inode(inode)
>  #define stat_inc_inline_dir(inode)
> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index 196cc78..ae0dd9b 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -137,7 +137,7 @@ static int do_read_inode(struct inode *inode)
>  	fi->i_pino = le32_to_cpu(ri->i_pino);
>  	fi->i_dir_level = ri->i_dir_level;
>  
> -	get_extent_info(&fi->ext, ri->i_ext);
> +	get_extent_info(inode, &ri->i_ext);
>  	get_inline_info(fi, ri);
>  
>  	/* check data exist */
> @@ -227,7 +227,7 @@ void update_inode(struct inode *inode, struct page *node_page)
>  	ri->i_links = cpu_to_le32(inode->i_nlink);
>  	ri->i_size = cpu_to_le64(i_size_read(inode));
>  	ri->i_blocks = cpu_to_le64(inode->i_blocks);
> -	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
> +	set_raw_extent(&F2FS_I(inode)->ei_tree, &ri->i_ext);
>  	set_raw_inline(F2FS_I(inode), ri);
>  
>  	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
> @@ -335,6 +335,7 @@ void f2fs_evict_inode(struct inode *inode)
>  no_delete:
>  	stat_dec_inline_dir(inode);
>  	stat_dec_inline_inode(inode);
> +	f2fs_free_extent_cache(inode);
>  	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
>  	if (xnid)
>  		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index f71421d..f205493 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -381,7 +381,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>  	atomic_set(&fi->dirty_pages, 0);
>  	fi->i_current_depth = 1;
>  	fi->i_advise = 0;
> -	rwlock_init(&fi->ext.ext_lock);
> +	fi->ei_tree.root = RB_ROOT;
> +	fi->ei_tree.cached_ei = NULL;
> +	rwlock_init(&fi->ei_tree.ei_lock);
>  	init_rwsem(&fi->i_sem);
>  	INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
>  	INIT_LIST_HEAD(&fi->inmem_pages);
> @@ -1235,10 +1237,13 @@ static int __init init_f2fs_fs(void)
>  	err = create_checkpoint_caches();
>  	if (err)
>  		goto free_gc_caches;
> +	err = create_extent_cache();
> +	if (err)
> +		goto free_checkpoint_caches;
>  	f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
>  	if (!f2fs_kset) {
>  		err = -ENOMEM;
> -		goto free_checkpoint_caches;
> +		goto free_extent_cache;
>  	}
>  	err = register_filesystem(&f2fs_fs_type);
>  	if (err)
> @@ -1249,6 +1254,8 @@ static int __init init_f2fs_fs(void)
>  
>  free_kset:
>  	kset_unregister(f2fs_kset);
> +free_extent_cache:
> +	destroy_extent_cache();
>  free_checkpoint_caches:
>  	destroy_checkpoint_caches();
>  free_gc_caches:
> -- 
> 2.1.2
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-22  2:03 ` Changman Lee
@ 2014-12-22  7:10   ` Chao Yu
  2014-12-22 23:16     ` Jaegeuk Kim
  2014-12-23  4:30     ` Changman Lee
  2014-12-22  9:06   ` Chao Yu
  1 sibling, 2 replies; 26+ messages in thread
From: Chao Yu @ 2014-12-22  7:10 UTC (permalink / raw)
  To: 'Changman Lee'
  Cc: 'Jaegeuk Kim', linux-f2fs-devel, linux-kernel

Hi Changman,

> -----Original Message-----
> From: Changman Lee [mailto:cm224.lee@samsung.com]
> Sent: Monday, December 22, 2014 10:03 AM
> To: Chao Yu
> Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Yu,
> 
> Good approach.

Thank you. :)

> As you know, however, f2fs breaks extent itself due to COW.

Yes, and sometimes f2fs use IPU when override writing, in this condition,
by using this approach we can cache more contiguous mapping extent for better
performance.

> Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> So we would have lots of extents per inode and it could lead to overhead
> to manage extents.

Agree, the more number of extents are growing in one inode, the more memory
pressure and longer latency operating in rb-tree we are facing.
IMO, to solve this problem, we'd better to add limitation or shrink ability into
extent cache:
1.limit extent number per inode with the value set from sysfs and discard extent
from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
of mapping extent per inode is fixed: 8)
2.add all extents of inodes into a global lru list, we will try to shrink this list
if we're facing memory pressure.

How do you think? or any better ideas are welcome. :)

> 
> Anyway, mount option could be alternative for this patch.

Yes, will do.

Thanks,
Yu

> 
> On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > between contiguous logical address and physical address.
> > Normally, this design will work well because f2fs will expand coverage area of
> > the mapping extent when we write forward sequentially. But when we write data
> > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > expanded for most time as following reasons:
> > 1.The short part of extent will be discarded if we break contiguous mapping in
> > the middle of extent.
> > 2.The new mapping will be added into mapping cache only at head or tail of the
> > extent.
> > 3.We will drop the extent cache when the extent became very fragmented.
> > 4.We will not update the extent with mapping which we get from readpages or
> > readpage.
> >
> > To solve above problems, this patch adds extent cache base on rb-tree like other
> > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > more effective cache between dnode page cache and disk. It will supply high hit
> > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > environment of low memory.
> >
> > Todo:
> > *introduce mount option for extent cache.
> > *add shrink ability for extent cache.
> >
> > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > ---



^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-22  2:03 ` Changman Lee
  2014-12-22  7:10   ` Chao Yu
@ 2014-12-22  9:06   ` Chao Yu
  1 sibling, 0 replies; 26+ messages in thread
From: Chao Yu @ 2014-12-22  9:06 UTC (permalink / raw)
  To: 'Changman Lee'
  Cc: 'Jaegeuk Kim', linux-f2fs-devel, linux-kernel

Hi Changman,

> -----Original Message-----
> From: Changman Lee [mailto:cm224.lee@samsung.com]
> Sent: Monday, December 22, 2014 10:03 AM
> To: Chao Yu
> Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Yu,
> 
> Good approach.
> As you know, however, f2fs breaks extent itself due to COW.
> Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> So we would have lots of extents per inode and it could lead to overhead
> to manage extents.
> 
> Anyway, mount option could be alternative for this patch.
> 
> On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > between contiguous logical address and physical address.
> > Normally, this design will work well because f2fs will expand coverage area of
> > the mapping extent when we write forward sequentially. But when we write data
> > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > expanded for most time as following reasons:
> > 1.The short part of extent will be discarded if we break contiguous mapping in
> > the middle of extent.
> > 2.The new mapping will be added into mapping cache only at head or tail of the
> > extent.
> > 3.We will drop the extent cache when the extent became very fragmented.
> > 4.We will not update the extent with mapping which we get from readpages or
> > readpage.
> >
> > To solve above problems, this patch adds extent cache base on rb-tree like other
> > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > more effective cache between dnode page cache and disk. It will supply high hit
> > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > environment of low memory.
> >
> > Todo:
> > *introduce mount option for extent cache.
> > *add shrink ability for extent cache.
> >
> > Signed-off-by: Chao Yu <chao2.yu@samsung.com>

[snip]

> > +static void __try_merge_extent(struct inode *inode, struct extent_info *ei)
> > +{
> > +	struct rb_root *root = &F2FS_I(inode)->ei_tree.root;
> > +	struct extent_info *pei = NULL;
> > +	struct rb_node *node;
> > +
> > +	node = rb_prev(&ei->rb_node);
> > +	if (node) {
> > +		pei = rb_entry(node, struct extent_info, rb_node);
> > +		if (ei->blk == pei->blk + pei->len) {
> 
> Shouldn't we check below together, too?
> if (ei->fofs == pei->fofs + pei->len)

Yes, you're right.
The following "if (ei->blk + 1 == pei->blk)" has the same problem.
I will fix this issue, thanks for your review!

Regards,
Yu



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-22  7:10   ` Chao Yu
@ 2014-12-22 23:16     ` Jaegeuk Kim
  2014-12-23  3:01       ` Chao Yu
  2014-12-23  4:30     ` Changman Lee
  1 sibling, 1 reply; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-22 23:16 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Chao,

On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> Hi Changman,
> 
> > -----Original Message-----
> > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > Sent: Monday, December 22, 2014 10:03 AM
> > To: Chao Yu
> > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Yu,
> > 
> > Good approach.
> 
> Thank you. :)
> 
> > As you know, however, f2fs breaks extent itself due to COW.
> 
> Yes, and sometimes f2fs use IPU when override writing, in this condition,
> by using this approach we can cache more contiguous mapping extent for better
> performance.

Hmm. When f2fs faces with this case, there is no chance to make an extent itself
at all.

> 
> > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > So we would have lots of extents per inode and it could lead to overhead
> > to manage extents.
> 
> Agree, the more number of extents are growing in one inode, the more memory
> pressure and longer latency operating in rb-tree we are facing.
> IMO, to solve this problem, we'd better to add limitation or shrink ability into
> extent cache:
> 1.limit extent number per inode with the value set from sysfs and discard extent
> from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> of mapping extent per inode is fixed: 8)
> 2.add all extents of inodes into a global lru list, we will try to shrink this list
> if we're facing memory pressure.
> 
> How do you think? or any better ideas are welcome. :)

Historically, the reason that I added only one small extent cache is that I
wanted to avoid additional data structures having any overhead in critical data
write path.
Instead, I intended to use a well operating node page cache.

We need to consider what would be the benefit when using extent cache rather
than existing node page cache.

Thanks,

> 
> > 
> > Anyway, mount option could be alternative for this patch.
> 
> Yes, will do.
> 
> Thanks,
> Yu
> 
> > 
> > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > between contiguous logical address and physical address.
> > > Normally, this design will work well because f2fs will expand coverage area of
> > > the mapping extent when we write forward sequentially. But when we write data
> > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > expanded for most time as following reasons:
> > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > the middle of extent.
> > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > extent.
> > > 3.We will drop the extent cache when the extent became very fragmented.
> > > 4.We will not update the extent with mapping which we get from readpages or
> > > readpage.
> > >
> > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > more effective cache between dnode page cache and disk. It will supply high hit
> > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > environment of low memory.
> > >
> > > Todo:
> > > *introduce mount option for extent cache.
> > > *add shrink ability for extent cache.
> > >
> > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > ---

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-22 23:16     ` Jaegeuk Kim
@ 2014-12-23  3:01       ` Chao Yu
  2014-12-23  7:36         ` Jaegeuk Kim
  0 siblings, 1 reply; 26+ messages in thread
From: Chao Yu @ 2014-12-23  3:01 UTC (permalink / raw)
  To: 'Jaegeuk Kim'
  Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Tuesday, December 23, 2014 7:16 AM
> To: Chao Yu
> Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > Hi Changman,
> >
> > > -----Original Message-----
> > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > Sent: Monday, December 22, 2014 10:03 AM
> > > To: Chao Yu
> > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi Yu,
> > >
> > > Good approach.
> >
> > Thank you. :)
> >
> > > As you know, however, f2fs breaks extent itself due to COW.
> >
> > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > by using this approach we can cache more contiguous mapping extent for better
> > performance.
> 
> Hmm. When f2fs faces with this case, there is no chance to make an extent itself
> at all.

With new implementation of this patch f2fs will build extent cache when readpage/readpages.

> 
> >
> > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > So we would have lots of extents per inode and it could lead to overhead
> > > to manage extents.
> >
> > Agree, the more number of extents are growing in one inode, the more memory
> > pressure and longer latency operating in rb-tree we are facing.
> > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > extent cache:
> > 1.limit extent number per inode with the value set from sysfs and discard extent
> > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > of mapping extent per inode is fixed: 8)
> > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > if we're facing memory pressure.
> >
> > How do you think? or any better ideas are welcome. :)
> 
> Historically, the reason that I added only one small extent cache is that I
> wanted to avoid additional data structures having any overhead in critical data
> write path.

Thank you for telling me the history of original extent cache.

> Instead, I intended to use a well operating node page cache.
> 
> We need to consider what would be the benefit when using extent cache rather
> than existing node page cache.

IMO, node page cache belongs to system level cache, filesystem sub system can
not control it completely, cached uptodate node page will be invalidated by
using drop_caches from sysfs, or reclaimer of mm, result in more IO when we need
these node page next time.
New extent cache belongs to filesystem level cache, it is completely controlled
by filesystem itself. What we can profit is: on the one hand, it is used as
first level cache above the node page cache, which can also increase the cache
hit ratio. On the other hand, it is more instable and controllable than node page
cache.

Thanks,
Yu

> 
> Thanks,
> 
> >
> > >
> > > Anyway, mount option could be alternative for this patch.
> >
> > Yes, will do.
> >
> > Thanks,
> > Yu
> >
> > >
> > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > between contiguous logical address and physical address.
> > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > the mapping extent when we write forward sequentially. But when we write data
> > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > expanded for most time as following reasons:
> > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > the middle of extent.
> > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > extent.
> > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > readpage.
> > > >
> > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > environment of low memory.
> > > >
> > > > Todo:
> > > > *introduce mount option for extent cache.
> > > > *add shrink ability for extent cache.
> > > >
> > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > ---


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-22  7:10   ` Chao Yu
  2014-12-22 23:16     ` Jaegeuk Kim
@ 2014-12-23  4:30     ` Changman Lee
  2014-12-23  9:35       ` Chao Yu
  1 sibling, 1 reply; 26+ messages in thread
From: Changman Lee @ 2014-12-23  4:30 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Jaegeuk Kim', linux-f2fs-devel, linux-kernel

Hi,

On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> Hi Changman,
> 
> > -----Original Message-----
> > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > Sent: Monday, December 22, 2014 10:03 AM
> > To: Chao Yu
> > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Yu,
> > 
> > Good approach.
> 
> Thank you. :)
> 
> > As you know, however, f2fs breaks extent itself due to COW.
> 
> Yes, and sometimes f2fs use IPU when override writing, in this condition,
> by using this approach we can cache more contiguous mapping extent for better
> performance.
> 
> > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > So we would have lots of extents per inode and it could lead to overhead
> > to manage extents.
> 
> Agree, the more number of extents are growing in one inode, the more memory
> pressure and longer latency operating in rb-tree we are facing.
> IMO, to solve this problem, we'd better to add limitation or shrink ability into
> extent cache:
> 1.limit extent number per inode with the value set from sysfs and discard extent
> from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> of mapping extent per inode is fixed: 8)
> 2.add all extents of inodes into a global lru list, we will try to shrink this list
> if we're facing memory pressure.
> 
> How do you think? or any better ideas are welcome. :)
> 

I think both of them are considerable options.
How about adding extent to inode selected by user using ioctl or xattr?
In the case of read most files having large size, user could get a benefit
surely although they are seperated some pieces.

Thanks,

> > 
> > Anyway, mount option could be alternative for this patch.
> 
> Yes, will do.
> 
> Thanks,
> Yu
> 
> > 
> > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > between contiguous logical address and physical address.
> > > Normally, this design will work well because f2fs will expand coverage area of
> > > the mapping extent when we write forward sequentially. But when we write data
> > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > expanded for most time as following reasons:
> > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > the middle of extent.
> > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > extent.
> > > 3.We will drop the extent cache when the extent became very fragmented.
> > > 4.We will not update the extent with mapping which we get from readpages or
> > > readpage.
> > >
> > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > more effective cache between dnode page cache and disk. It will supply high hit
> > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > environment of low memory.
> > >
> > > Todo:
> > > *introduce mount option for extent cache.
> > > *add shrink ability for extent cache.
> > >
> > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > ---
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-23  3:01       ` Chao Yu
@ 2014-12-23  7:36         ` Jaegeuk Kim
  2014-12-23  8:45           ` Changman Lee
  2014-12-24  8:01           ` Chao Yu
  0 siblings, 2 replies; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-23  7:36 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Chao,

On Tue, Dec 23, 2014 at 11:01:39AM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Tuesday, December 23, 2014 7:16 AM
> > To: Chao Yu
> > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Chao,
> > 
> > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > Hi Changman,
> > >
> > > > -----Original Message-----
> > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > To: Chao Yu
> > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > >
> > > > Hi Yu,
> > > >
> > > > Good approach.
> > >
> > > Thank you. :)
> > >
> > > > As you know, however, f2fs breaks extent itself due to COW.
> > >
> > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > by using this approach we can cache more contiguous mapping extent for better
> > > performance.
> > 
> > Hmm. When f2fs faces with this case, there is no chance to make an extent itself
> > at all.
> 
> With new implementation of this patch f2fs will build extent cache when readpage/readpages.

I don't understand your points exactly. :(
If there are no on-disk extents, it doesn't matter when the caches are built.
Could you define what scenarios you're looking at?

> 
> > 
> > >
> > > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > > So we would have lots of extents per inode and it could lead to overhead
> > > > to manage extents.
> > >
> > > Agree, the more number of extents are growing in one inode, the more memory
> > > pressure and longer latency operating in rb-tree we are facing.
> > > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > > extent cache:
> > > 1.limit extent number per inode with the value set from sysfs and discard extent
> > > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > > of mapping extent per inode is fixed: 8)
> > > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > > if we're facing memory pressure.
> > >
> > > How do you think? or any better ideas are welcome. :)
> > 
> > Historically, the reason that I added only one small extent cache is that I
> > wanted to avoid additional data structures having any overhead in critical data
> > write path.
> 
> Thank you for telling me the history of original extent cache.
> 
> > Instead, I intended to use a well operating node page cache.
> > 
> > We need to consider what would be the benefit when using extent cache rather
> > than existing node page cache.
> 
> IMO, node page cache belongs to system level cache, filesystem sub system can
> not control it completely, cached uptodate node page will be invalidated by
> using drop_caches from sysfs, or reclaimer of mm, result in more IO when we need
> these node page next time.

Yes, that's exactly what I wanted.

> New extent cache belongs to filesystem level cache, it is completely controlled
> by filesystem itself. What we can profit is: on the one hand, it is used as
> first level cache above the node page cache, which can also increase the cache
> hit ratio.

I don't think so. The hit ratio depends on the cache policy. The node page
cache is managed globally by kernel in LRU manner, so I think this can show
affordable hit ratio.

> On the other hand, it is more instable and controllable than node page
> cache.

It depends on how you can control the extent cache. But, I'm not sure that
would be better than page cache managed by MM.

So, my concerns are:

1. Redundant memory overhead
 : The extent cache is likely on top of the node page cache, which will consume 
 memory redundantly.

2. CPU overhead
 : In every block address updates, it needs to traverse extent cache entries.

3. Effectiveness
 : We have a node page cache that is managed by MM in LRU order. I think this
 provides good hit ratio, system-wide memory relciaming algorithms, and well-
 defined locking mechanism.

4. Cache reclaiming policy
 a. global approach: it needs to consider lock contention, CPU overhead, and
                     shrinker. I don't think it is better than page cache.
 b. local approach: there still exists cold misses at the initial read
                    operations. After then, how does the extent cache increase
		    hit ratio more than giving node page cache?

		    For example, in the case of pretty normal scenario like
		    open -> read -> close -> open -> read ..., we can't get
		    benefits form locally-managed extent cache, while node
		    page cache serves many block addresses.

This is my initial thought on the extent cache.
Definitely, it is worth to discuss further in more details.

Thanks,

> 
> Thanks,
> Yu
> 
> > 
> > Thanks,
> > 
> > >
> > > >
> > > > Anyway, mount option could be alternative for this patch.
> > >
> > > Yes, will do.
> > >
> > > Thanks,
> > > Yu
> > >
> > > >
> > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > between contiguous logical address and physical address.
> > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > expanded for most time as following reasons:
> > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > the middle of extent.
> > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > extent.
> > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > readpage.
> > > > >
> > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > environment of low memory.
> > > > >
> > > > > Todo:
> > > > > *introduce mount option for extent cache.
> > > > > *add shrink ability for extent cache.
> > > > >
> > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > ---

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-23  7:36         ` Jaegeuk Kim
@ 2014-12-23  8:45           ` Changman Lee
  2014-12-24  8:01           ` Chao Yu
  1 sibling, 0 replies; 26+ messages in thread
From: Changman Lee @ 2014-12-23  8:45 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: Chao Yu, linux-f2fs-devel, linux-kernel

On Mon, Dec 22, 2014 at 11:36:09PM -0800, Jaegeuk Kim wrote:
> Hi Chao,
> 
> On Tue, Dec 23, 2014 at 11:01:39AM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> > 
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Tuesday, December 23, 2014 7:16 AM
> > > To: Chao Yu
> > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > 
> > > Hi Chao,
> > > 
> > > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > > Hi Changman,
> > > >
> > > > > -----Original Message-----
> > > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > > To: Chao Yu
> > > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > > >
> > > > > Hi Yu,
> > > > >
> > > > > Good approach.
> > > >
> > > > Thank you. :)
> > > >
> > > > > As you know, however, f2fs breaks extent itself due to COW.
> > > >
> > > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > > by using this approach we can cache more contiguous mapping extent for better
> > > > performance.
> > > 
> > > Hmm. When f2fs faces with this case, there is no chance to make an extent itself
> > > at all.
> > 
> > With new implementation of this patch f2fs will build extent cache when readpage/readpages.
> 
> I don't understand your points exactly. :(
> If there are no on-disk extents, it doesn't matter when the caches are built.
> Could you define what scenarios you're looking at?
> 
> > 
> > > 
> > > >
> > > > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > > > So we would have lots of extents per inode and it could lead to overhead
> > > > > to manage extents.
> > > >
> > > > Agree, the more number of extents are growing in one inode, the more memory
> > > > pressure and longer latency operating in rb-tree we are facing.
> > > > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > > > extent cache:
> > > > 1.limit extent number per inode with the value set from sysfs and discard extent
> > > > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > > > of mapping extent per inode is fixed: 8)
> > > > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > > > if we're facing memory pressure.
> > > >
> > > > How do you think? or any better ideas are welcome. :)
> > > 
> > > Historically, the reason that I added only one small extent cache is that I
> > > wanted to avoid additional data structures having any overhead in critical data
> > > write path.
> > 
> > Thank you for telling me the history of original extent cache.
> > 
> > > Instead, I intended to use a well operating node page cache.
> > > 
> > > We need to consider what would be the benefit when using extent cache rather
> > > than existing node page cache.
> > 
> > IMO, node page cache belongs to system level cache, filesystem sub system can
> > not control it completely, cached uptodate node page will be invalidated by
> > using drop_caches from sysfs, or reclaimer of mm, result in more IO when we need
> > these node page next time.
> 
> Yes, that's exactly what I wanted.
> 
> > New extent cache belongs to filesystem level cache, it is completely controlled
> > by filesystem itself. What we can profit is: on the one hand, it is used as
> > first level cache above the node page cache, which can also increase the cache
> > hit ratio.
> 
> I don't think so. The hit ratio depends on the cache policy. The node page
> cache is managed globally by kernel in LRU manner, so I think this can show
> affordable hit ratio.
> 
> > On the other hand, it is more instable and controllable than node page
> > cache.
> 
> It depends on how you can control the extent cache. But, I'm not sure that
> would be better than page cache managed by MM.
> 
> So, my concerns are:
> 
> 1. Redundant memory overhead
>  : The extent cache is likely on top of the node page cache, which will consume 
>  memory redundantly.
> 
> 2. CPU overhead
>  : In every block address updates, it needs to traverse extent cache entries.
> 
> 3. Effectiveness
>  : We have a node page cache that is managed by MM in LRU order. I think this
>  provides good hit ratio, system-wide memory relciaming algorithms, and well-
>  defined locking mechanism.
> 
> 4. Cache reclaiming policy
>  a. global approach: it needs to consider lock contention, CPU overhead, and
>                      shrinker. I don't think it is better than page cache.
>  b. local approach: there still exists cold misses at the initial read
>                     operations. After then, how does the extent cache increase
> 		    hit ratio more than giving node page cache?
> 
> 		    For example, in the case of pretty normal scenario like
> 		    open -> read -> close -> open -> read ..., we can't get
> 		    benefits form locally-managed extent cache, while node
> 		    page cache serves many block addresses.

I think we can solve the problem you pointed by managing global extent cache with i_ino.
If so, we don't lose extent caches.
And We can control extent caches by LRU as memory reclaim. (this is Chao's idea)

> 
> This is my initial thought on the extent cache.
> Definitely, it is worth to discuss further in more details.

Neverthless, I think Chao's suggestion have some benefits.
It needs more memory to keep node pages than extent caches.
The extent is better effective because it covers greater space as using
small memory.
I told this before but again, how about using ioctl or xattr for caching extent?
User could get a benefit for read most files which are fragmented in a
few chunks.

If we add some ideas on Chao's patch, we will get good result.

Regards,
Changman

> 
> Thanks,
> 
> > 
> > Thanks,
> > Yu
> > 
> > > 
> > > Thanks,
> > > 
> > > >
> > > > >
> > > > > Anyway, mount option could be alternative for this patch.
> > > >
> > > > Yes, will do.
> > > >
> > > > Thanks,
> > > > Yu
> > > >
> > > > >
> > > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > > between contiguous logical address and physical address.
> > > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > > expanded for most time as following reasons:
> > > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > > the middle of extent.
> > > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > > extent.
> > > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > > readpage.
> > > > > >
> > > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > > environment of low memory.
> > > > > >
> > > > > > Todo:
> > > > > > *introduce mount option for extent cache.
> > > > > > *add shrink ability for extent cache.
> > > > > >
> > > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > > ---

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-23  4:30     ` Changman Lee
@ 2014-12-23  9:35       ` Chao Yu
  2014-12-23 19:24           ` Jaegeuk Kim
  0 siblings, 1 reply; 26+ messages in thread
From: Chao Yu @ 2014-12-23  9:35 UTC (permalink / raw)
  To: 'Changman Lee'
  Cc: 'Jaegeuk Kim', linux-f2fs-devel, linux-kernel

Hi Changman,

> -----Original Message-----
> From: Changman Lee [mailto:cm224.lee@samsung.com]
> Sent: Tuesday, December 23, 2014 12:31 PM
> To: Chao Yu
> Cc: 'Jaegeuk Kim'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi,
> 
> On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > Hi Changman,
> >
> > > -----Original Message-----
> > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > Sent: Monday, December 22, 2014 10:03 AM
> > > To: Chao Yu
> > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi Yu,
> > >
> > > Good approach.
> >
> > Thank you. :)
> >
> > > As you know, however, f2fs breaks extent itself due to COW.
> >
> > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > by using this approach we can cache more contiguous mapping extent for better
> > performance.
> >
> > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > So we would have lots of extents per inode and it could lead to overhead
> > > to manage extents.
> >
> > Agree, the more number of extents are growing in one inode, the more memory
> > pressure and longer latency operating in rb-tree we are facing.
> > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > extent cache:
> > 1.limit extent number per inode with the value set from sysfs and discard extent
> > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > of mapping extent per inode is fixed: 8)
> > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > if we're facing memory pressure.
> >
> > How do you think? or any better ideas are welcome. :)
> >
> 
> I think both of them are considerable options.
> How about adding extent to inode selected by user using ioctl or xattr?
> In the case of read most files having large size, user could get a benefit
> surely although they are seperated some pieces.

Yes, I agree, I prefer ioctl more than xattr, as xattr will take over some extra
space of inode, and also new added xattr name may conflict with exist extent xattr.

Anyway, this will provide more flexible management of extent cache to users of f2fs.

Thank you for your idea!

Regards,
Yu

> 
> Thanks,
> 
> > >
> > > Anyway, mount option could be alternative for this patch.
> >
> > Yes, will do.
> >
> > Thanks,
> > Yu
> >
> > >
> > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > between contiguous logical address and physical address.
> > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > the mapping extent when we write forward sequentially. But when we write data
> > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > expanded for most time as following reasons:
> > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > the middle of extent.
> > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > extent.
> > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > readpage.
> > > >
> > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > environment of low memory.
> > > >
> > > > Todo:
> > > > *introduce mount option for extent cache.
> > > > *add shrink ability for extent cache.
> > > >
> > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > ---
> >


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-23  9:35       ` Chao Yu
@ 2014-12-23 19:24           ` Jaegeuk Kim
  0 siblings, 0 replies; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-23 19:24 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

On Tue, Dec 23, 2014 at 05:35:11PM +0800, Chao Yu wrote:
> Hi Changman,
> 
> > -----Original Message-----
> > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > Sent: Tuesday, December 23, 2014 12:31 PM
> > To: Chao Yu
> > Cc: 'Jaegeuk Kim'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi,
> > 
> > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > Hi Changman,
> > >
> > > > -----Original Message-----
> > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > To: Chao Yu
> > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > >
> > > > Hi Yu,
> > > >
> > > > Good approach.
> > >
> > > Thank you. :)
> > >
> > > > As you know, however, f2fs breaks extent itself due to COW.
> > >
> > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > by using this approach we can cache more contiguous mapping extent for better
> > > performance.
> > >
> > > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > > So we would have lots of extents per inode and it could lead to overhead
> > > > to manage extents.
> > >
> > > Agree, the more number of extents are growing in one inode, the more memory
> > > pressure and longer latency operating in rb-tree we are facing.
> > > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > > extent cache:
> > > 1.limit extent number per inode with the value set from sysfs and discard extent
> > > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > > of mapping extent per inode is fixed: 8)
> > > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > > if we're facing memory pressure.
> > >
> > > How do you think? or any better ideas are welcome. :)
> > >
> > 
> > I think both of them are considerable options.
> > How about adding extent to inode selected by user using ioctl or xattr?
> > In the case of read most files having large size, user could get a benefit
> > surely although they are seperated some pieces.
> 
> Yes, I agree, I prefer ioctl more than xattr, as xattr will take over some extra
> space of inode, and also new added xattr name may conflict with exist extent xattr.
> 
> Anyway, this will provide more flexible management of extent cache to users of f2fs.

Oh, well.
This is not a good way, since we cannot guide users to call another system call
for their files.

Instead, it needs to check fadvise hints or RDONLY in the open flag?

Thanks,

> 
> Thank you for your idea!
> 
> Regards,
> Yu
> 
> > 
> > Thanks,
> > 
> > > >
> > > > Anyway, mount option could be alternative for this patch.
> > >
> > > Yes, will do.
> > >
> > > Thanks,
> > > Yu
> > >
> > > >
> > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > between contiguous logical address and physical address.
> > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > expanded for most time as following reasons:
> > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > the middle of extent.
> > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > extent.
> > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > readpage.
> > > > >
> > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > environment of low memory.
> > > > >
> > > > > Todo:
> > > > > *introduce mount option for extent cache.
> > > > > *add shrink ability for extent cache.
> > > > >
> > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > ---
> > >

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
@ 2014-12-23 19:24           ` Jaegeuk Kim
  0 siblings, 0 replies; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-23 19:24 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On Tue, Dec 23, 2014 at 05:35:11PM +0800, Chao Yu wrote:
> Hi Changman,
> 
> > -----Original Message-----
> > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > Sent: Tuesday, December 23, 2014 12:31 PM
> > To: Chao Yu
> > Cc: 'Jaegeuk Kim'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi,
> > 
> > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > Hi Changman,
> > >
> > > > -----Original Message-----
> > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > To: Chao Yu
> > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > >
> > > > Hi Yu,
> > > >
> > > > Good approach.
> > >
> > > Thank you. :)
> > >
> > > > As you know, however, f2fs breaks extent itself due to COW.
> > >
> > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > by using this approach we can cache more contiguous mapping extent for better
> > > performance.
> > >
> > > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > > So we would have lots of extents per inode and it could lead to overhead
> > > > to manage extents.
> > >
> > > Agree, the more number of extents are growing in one inode, the more memory
> > > pressure and longer latency operating in rb-tree we are facing.
> > > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > > extent cache:
> > > 1.limit extent number per inode with the value set from sysfs and discard extent
> > > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > > of mapping extent per inode is fixed: 8)
> > > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > > if we're facing memory pressure.
> > >
> > > How do you think? or any better ideas are welcome. :)
> > >
> > 
> > I think both of them are considerable options.
> > How about adding extent to inode selected by user using ioctl or xattr?
> > In the case of read most files having large size, user could get a benefit
> > surely although they are seperated some pieces.
> 
> Yes, I agree, I prefer ioctl more than xattr, as xattr will take over some extra
> space of inode, and also new added xattr name may conflict with exist extent xattr.
> 
> Anyway, this will provide more flexible management of extent cache to users of f2fs.

Oh, well.
This is not a good way, since we cannot guide users to call another system call
for their files.

Instead, it needs to check fadvise hints or RDONLY in the open flag?

Thanks,

> 
> Thank you for your idea!
> 
> Regards,
> Yu
> 
> > 
> > Thanks,
> > 
> > > >
> > > > Anyway, mount option could be alternative for this patch.
> > >
> > > Yes, will do.
> > >
> > > Thanks,
> > > Yu
> > >
> > > >
> > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > between contiguous logical address and physical address.
> > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > expanded for most time as following reasons:
> > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > the middle of extent.
> > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > extent.
> > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > readpage.
> > > > >
> > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > environment of low memory.
> > > > >
> > > > > Todo:
> > > > > *introduce mount option for extent cache.
> > > > > *add shrink ability for extent cache.
> > > > >
> > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > ---
> > >

------------------------------------------------------------------------------
Dive into the World of Parallel Programming! The Go Parallel Website,
sponsored by Intel and developed in partnership with Slashdot Media, is your
hub for all things parallel software development, from weekly thought
leadership blogs to news, videos, case studies, tutorials and more. Take a
look and join the conversation now. http://goparallel.sourceforge.net

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-23  7:36         ` Jaegeuk Kim
  2014-12-23  8:45           ` Changman Lee
@ 2014-12-24  8:01           ` Chao Yu
  2014-12-25  8:35             ` Jaegeuk Kim
  1 sibling, 1 reply; 26+ messages in thread
From: Chao Yu @ 2014-12-24  8:01 UTC (permalink / raw)
  To: 'Jaegeuk Kim'
  Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Tuesday, December 23, 2014 3:36 PM
> To: Chao Yu
> Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Tue, Dec 23, 2014 at 11:01:39AM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Tuesday, December 23, 2014 7:16 AM
> > > To: Chao Yu
> > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi Chao,
> > >
> > > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > > Hi Changman,
> > > >
> > > > > -----Original Message-----
> > > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > > To: Chao Yu
> > > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > > >
> > > > > Hi Yu,
> > > > >
> > > > > Good approach.
> > > >
> > > > Thank you. :)
> > > >
> > > > > As you know, however, f2fs breaks extent itself due to COW.
> > > >
> > > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > > by using this approach we can cache more contiguous mapping extent for better
> > > > performance.
> > >
> > > Hmm. When f2fs faces with this case, there is no chance to make an extent itself
> > > at all.
> >
> > With new implementation of this patch f2fs will build extent cache when readpage/readpages.
> 
> I don't understand your points exactly. :(
> If there are no on-disk extents, it doesn't matter when the caches are built.
> Could you define what scenarios you're looking at?

What I mean is that IPU will not split the exist extent in extent cache; and
this exist extent cache was been built when we init cache with last accessed extent
(the only on-disk extent) which was stored in inode, or be built when we invoke
get_data_block in readpage/readpages in IPU mode. So there is a chance to make
extent in this scenario.

> 
> >
> > >
> > > >
> > > > > Unlike other filesystem like btrfs, minimum extent of f2fs could have 4KB granularity.
> > > > > So we would have lots of extents per inode and it could lead to overhead
> > > > > to manage extents.
> > > >
> > > > Agree, the more number of extents are growing in one inode, the more memory
> > > > pressure and longer latency operating in rb-tree we are facing.
> > > > IMO, to solve this problem, we'd better to add limitation or shrink ability into
> > > > extent cache:
> > > > 1.limit extent number per inode with the value set from sysfs and discard extent
> > > > from inode's extent lru list if we touch the limitation; (e.g. in FAT, max number
> > > > of mapping extent per inode is fixed: 8)
> > > > 2.add all extents of inodes into a global lru list, we will try to shrink this list
> > > > if we're facing memory pressure.
> > > >
> > > > How do you think? or any better ideas are welcome. :)
> > >
> > > Historically, the reason that I added only one small extent cache is that I
> > > wanted to avoid additional data structures having any overhead in critical data
> > > write path.
> >
> > Thank you for telling me the history of original extent cache.
> >
> > > Instead, I intended to use a well operating node page cache.
> > >
> > > We need to consider what would be the benefit when using extent cache rather
> > > than existing node page cache.
> >
> > IMO, node page cache belongs to system level cache, filesystem sub system can
> > not control it completely, cached uptodate node page will be invalidated by
> > using drop_caches from sysfs, or reclaimer of mm, result in more IO when we need
> > these node page next time.
> 
> Yes, that's exactly what I wanted.

IMO, cost is expensive when we read node page again if these pages are invalidated
by MM. In the worst case, we will read 3 ind-node blocks + 1 dnode block + 3 NAT blocks
from disk for one blkaddr.

> 
> > New extent cache belongs to filesystem level cache, it is completely controlled
> > by filesystem itself. What we can profit is: on the one hand, it is used as
> > first level cache above the node page cache, which can also increase the cache
> > hit ratio.
> 
> I don't think so. The hit ratio depends on the cache policy. The node page
> cache is managed globally by kernel in LRU manner, so I think this can show
> affordable hit ratio.

As I test, in this scenario we can have higher ratio hit in new extent cache
than original extent cache:
1.write a large file
2.write randomly in this file
3.drop cache through drop_caches entry (or reclaim by MM)
4.read this large file

We cache all segregated extents in inode, so our ratio hit is 100% in
above scenario. If we add cache policy in extent cache, our hit ratio
will drop. But added policy can provide us more choose for different hit
ratio (may affect IO count) requirement of users.

> 
> > On the other hand, it is more instable and controllable than node page
> > cache.
> 
> It depends on how you can control the extent cache. But, I'm not sure that
> would be better than page cache managed by MM.
> 
> So, my concerns are:
> 
> 1. Redundant memory overhead
>  : The extent cache is likely on top of the node page cache, which will consume
>  memory redundantly.
> 
> 2. CPU overhead
>  : In every block address updates, it needs to traverse extent cache entries.

As I mentioned above, if extent cache has the ability of limitation and shrink,
overhead of memory and CPU can be controlled.

> 
> 3. Effectiveness
>  : We have a node page cache that is managed by MM in LRU order. I think this
>  provides good hit ratio, system-wide memory relciaming algorithms, and well-
>  defined locking mechanism.

The effectiveness of new extent cache is the key point we should discuss.

IMO, hit ratio of extent cache and memory/CPU cost can be tradeoff.
For example:
1.if limitation is on, max value is 1; shrink is off:
	our extent cache will be the same as original extent cache.
2.if limitation is off, shrink is off:
	our extent cache can provide high hit ratio when node page cache is
	invalid but there is more memory/CPU overhead.
3.there are more status between label 1 and label 2 as limitation and shrink is
set to different value.
So we just develop our extent cache for more functionality and selective.

Another point is that, now size of struct extent_info is 24 bytes which is much
smaller than 4096 of node page size (ind-node page size is not calced), it's
cost-efficient to use small memory to store contiguous mapping. (This is pointed
out by Changman also)

> 
> 4. Cache reclaiming policy
>  a. global approach: it needs to consider lock contention, CPU overhead, and
>                      shrinker. I don't think it is better than page cache.
>  b. local approach: there still exists cold misses at the initial read
>                     operations. After then, how does the extent cache increase
> 		    hit ratio more than giving node page cache?
> 
> 		    For example, in the case of pretty normal scenario like
> 		    open -> read -> close -> open -> read ..., we can't get
> 		    benefits form locally-managed extent cache, while node
> 		    page cache serves many block addresses.

If this case should be covered, how about remembering these recently accessed extent
in a global list when evict, recovering extent cache from list when re-open?

Thanks,
Yu

> 
> This is my initial thought on the extent cache.
> Definitely, it is worth to discuss further in more details.
> 
> Thanks,
> 
> >
> > Thanks,
> > Yu
> >
> > >
> > > Thanks,
> > >
> > > >
> > > > >
> > > > > Anyway, mount option could be alternative for this patch.
> > > >
> > > > Yes, will do.
> > > >
> > > > Thanks,
> > > > Yu
> > > >
> > > > >
> > > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > > between contiguous logical address and physical address.
> > > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > > expanded for most time as following reasons:
> > > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > > the middle of extent.
> > > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > > extent.
> > > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > > readpage.
> > > > > >
> > > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > > environment of low memory.
> > > > > >
> > > > > > Todo:
> > > > > > *introduce mount option for extent cache.
> > > > > > *add shrink ability for extent cache.
> > > > > >
> > > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > > ---


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-24  8:01           ` Chao Yu
@ 2014-12-25  8:35             ` Jaegeuk Kim
  2014-12-29  7:19               ` Chao Yu
  0 siblings, 1 reply; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-25  8:35 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Chao,

On Wed, Dec 24, 2014 at 04:01:16PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Tuesday, December 23, 2014 3:36 PM
> > To: Chao Yu
> > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Chao,
> > 
> > On Tue, Dec 23, 2014 at 11:01:39AM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > > -----Original Message-----
> > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > Sent: Tuesday, December 23, 2014 7:16 AM
> > > > To: Chao Yu
> > > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > >
> > > > Hi Chao,
> > > >
> > > > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > > > Hi Changman,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > > > To: Chao Yu
> > > > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > > > >
> > > > > > Hi Yu,
> > > > > >
> > > > > > Good approach.
> > > > >
> > > > > Thank you. :)
> > > > >
> > > > > > As you know, however, f2fs breaks extent itself due to COW.
> > > > >
> > > > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > > > by using this approach we can cache more contiguous mapping extent for better
> > > > > performance.
> > > >
> > > > Hmm. When f2fs faces with this case, there is no chance to make an extent itself
> > > > at all.
> > >
> > > With new implementation of this patch f2fs will build extent cache when readpage/readpages.
> > 
> > I don't understand your points exactly. :(
> > If there are no on-disk extents, it doesn't matter when the caches are built.
> > Could you define what scenarios you're looking at?
> 
> What I mean is that IPU will not split the exist extent in extent cache; and
> this exist extent cache was been built when we init cache with last accessed extent
> (the only on-disk extent) which was stored in inode, or be built when we invoke
> get_data_block in readpage/readpages in IPU mode. So there is a chance to make
> extent in this scenario.

As long as I can understand your point, IPU can mitigate data fragmentation,
resulting in more extents, but current IPU policies except FORCE are activated
only when partition was already fragmented a lot such as high utilization or
SSR modes. That's why I said IPU is actually not helping extent caches.

[snip]

> > > IMO, node page cache belongs to system level cache, filesystem sub system can
> > > not control it completely, cached uptodate node page will be invalidated by
> > > using drop_caches from sysfs, or reclaimer of mm, result in more IO when we need
> > > these node page next time.
> > 
> > Yes, that's exactly what I wanted.
> 
> IMO, cost is expensive when we read node page again if these pages are invalidated
> by MM. In the worst case, we will read 3 ind-node blocks + 1 dnode block + 3 NAT blocks
> from disk for one blkaddr.

My point is who should control this sort of caches.
At least, we should focus on the hit ratio, not the worst case comparison.
In the worst case, extent cache has more overhead than node page cache alone.

> 
> > 
> > > New extent cache belongs to filesystem level cache, it is completely controlled
> > > by filesystem itself. What we can profit is: on the one hand, it is used as
> > > first level cache above the node page cache, which can also increase the cache
> > > hit ratio.
> > 
> > I don't think so. The hit ratio depends on the cache policy. The node page
> > cache is managed globally by kernel in LRU manner, so I think this can show
> > affordable hit ratio.
> 
> As I test, in this scenario we can have higher ratio hit in new extent cache
> than original extent cache:
> 1.write a large file
> 2.write randomly in this file
> 3.drop cache through drop_caches entry (or reclaim by MM)
> 4.read this large file
> 
> We cache all segregated extents in inode, so our ratio hit is 100% in
> above scenario. If we add cache policy in extent cache, our hit ratio
> will drop. But added policy can provide us more choose for different hit
> ratio (may affect IO count) requirement of users.

Of course! Even though system wants to get more memory, you're trying to grab
in-house memory for extents.
Or, new shrinker reclaims some of them, but it will drop hit raio obviously.

Please see below as a summary.

> 
> > 
> > > On the other hand, it is more instable and controllable than node page
> > > cache.
> > 
> > It depends on how you can control the extent cache. But, I'm not sure that
> > would be better than page cache managed by MM.
> > 
> > So, my concerns are:
> > 
> > 1. Redundant memory overhead
> >  : The extent cache is likely on top of the node page cache, which will consume
> >  memory redundantly.
> > 
> > 2. CPU overhead
> >  : In every block address updates, it needs to traverse extent cache entries.
> 
> As I mentioned above, if extent cache has the ability of limitation and shrink,
> overhead of memory and CPU can be controlled.
> 
> > 
> > 3. Effectiveness
> >  : We have a node page cache that is managed by MM in LRU order. I think this
> >  provides good hit ratio, system-wide memory relciaming algorithms, and well-
> >  defined locking mechanism.
> 
> The effectiveness of new extent cache is the key point we should discuss.
> 
> IMO, hit ratio of extent cache and memory/CPU cost can be tradeoff.
> For example:
> 1.if limitation is on, max value is 1; shrink is off:
> 	our extent cache will be the same as original extent cache.
> 2.if limitation is off, shrink is off:
> 	our extent cache can provide high hit ratio when node page cache is
> 	invalid but there is more memory/CPU overhead.
> 3.there are more status between label 1 and label 2 as limitation and shrink is
> set to different value.
> So we just develop our extent cache for more functionality and selective.
> 
> Another point is that, now size of struct extent_info is 24 bytes which is much
> smaller than 4096 of node page size (ind-node page size is not calced), it's
> cost-efficient to use small memory to store contiguous mapping. (This is pointed
> out by Changman also)
> 
> > 
> > 4. Cache reclaiming policy
> >  a. global approach: it needs to consider lock contention, CPU overhead, and
> >                      shrinker. I don't think it is better than page cache.
> >  b. local approach: there still exists cold misses at the initial read
> >                     operations. After then, how does the extent cache increase
> > 		    hit ratio more than giving node page cache?
> > 
> > 		    For example, in the case of pretty normal scenario like
> > 		    open -> read -> close -> open -> read ..., we can't get
> > 		    benefits form locally-managed extent cache, while node
> > 		    page cache serves many block addresses.
> 
> If this case should be covered, how about remembering these recently accessed extent
> in a global list when evict, recovering extent cache from list when re-open?

Oh, surely, this example is from the local approach, not existing in
the global one. :)

I guess you've understood my concerns enoughly.
So, could you share a detailed design for all these stuffs?

For the design, I think these must be addressed seriously.
 1. decide one of global or local management
 2. should add a shrinker
 3. reduce lock contention and critical region

Please, limit this feature to consume memory as less as possible, since it will
increase memory footprint definitely.

Thanks,

> 
> Thanks,
> Yu
> 
> > 
> > This is my initial thought on the extent cache.
> > Definitely, it is worth to discuss further in more details.
> > 
> > Thanks,
> > 
> > >
> > > Thanks,
> > > Yu
> > >
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > >
> > > > > > Anyway, mount option could be alternative for this patch.
> > > > >
> > > > > Yes, will do.
> > > > >
> > > > > Thanks,
> > > > > Yu
> > > > >
> > > > > >
> > > > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > > > between contiguous logical address and physical address.
> > > > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > > > expanded for most time as following reasons:
> > > > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > > > the middle of extent.
> > > > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > > > extent.
> > > > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > > > readpage.
> > > > > > >
> > > > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > > > environment of low memory.
> > > > > > >
> > > > > > > Todo:
> > > > > > > *introduce mount option for extent cache.
> > > > > > > *add shrink ability for extent cache.
> > > > > > >
> > > > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > > > ---

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-25  8:35             ` Jaegeuk Kim
@ 2014-12-29  7:19               ` Chao Yu
  2014-12-29 21:23                 ` Jaegeuk Kim
  0 siblings, 1 reply; 26+ messages in thread
From: Chao Yu @ 2014-12-29  7:19 UTC (permalink / raw)
  To: 'Jaegeuk Kim'
  Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Thursday, December 25, 2014 4:35 PM
> To: Chao Yu
> Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Wed, Dec 24, 2014 at 04:01:16PM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Tuesday, December 23, 2014 3:36 PM
> > > To: Chao Yu
> > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi Chao,
> > >
> > > On Tue, Dec 23, 2014 at 11:01:39AM +0800, Chao Yu wrote:
> > > > Hi Jaegeuk,
> > > >
> > > > > -----Original Message-----
> > > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > > Sent: Tuesday, December 23, 2014 7:16 AM
> > > > > To: Chao Yu
> > > > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net;
> linux-kernel@vger.kernel.org
> > > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > > >
> > > > > Hi Chao,
> > > > >
> > > > > On Mon, Dec 22, 2014 at 03:10:30PM +0800, Chao Yu wrote:
> > > > > > Hi Changman,
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > > > > > Sent: Monday, December 22, 2014 10:03 AM
> > > > > > > To: Chao Yu
> > > > > > > Cc: Jaegeuk Kim; linux-f2fs-devel@lists.sourceforge.net;
> linux-kernel@vger.kernel.org
> > > > > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > > > > >
> > > > > > > Hi Yu,
> > > > > > >
> > > > > > > Good approach.
> > > > > >
> > > > > > Thank you. :)
> > > > > >
> > > > > > > As you know, however, f2fs breaks extent itself due to COW.
> > > > > >
> > > > > > Yes, and sometimes f2fs use IPU when override writing, in this condition,
> > > > > > by using this approach we can cache more contiguous mapping extent for better
> > > > > > performance.
> > > > >
> > > > > Hmm. When f2fs faces with this case, there is no chance to make an extent itself
> > > > > at all.
> > > >
> > > > With new implementation of this patch f2fs will build extent cache when readpage/readpages.
> > >
> > > I don't understand your points exactly. :(
> > > If there are no on-disk extents, it doesn't matter when the caches are built.
> > > Could you define what scenarios you're looking at?
> >
> > What I mean is that IPU will not split the exist extent in extent cache; and
> > this exist extent cache was been built when we init cache with last accessed extent
> > (the only on-disk extent) which was stored in inode, or be built when we invoke
> > get_data_block in readpage/readpages in IPU mode. So there is a chance to make
> > extent in this scenario.
> 
> As long as I can understand your point, IPU can mitigate data fragmentation,
> resulting in more extents, but current IPU policies except FORCE are activated
> only when partition was already fragmented a lot such as high utilization or
> SSR modes. That's why I said IPU is actually not helping extent caches.

Yeah, I think in a real environment this point you descript is right.

> 
> [snip]
> 
> > > > IMO, node page cache belongs to system level cache, filesystem sub system can
> > > > not control it completely, cached uptodate node page will be invalidated by
> > > > using drop_caches from sysfs, or reclaimer of mm, result in more IO when we need
> > > > these node page next time.
> > >
> > > Yes, that's exactly what I wanted.
> >
> > IMO, cost is expensive when we read node page again if these pages are invalidated
> > by MM. In the worst case, we will read 3 ind-node blocks + 1 dnode block + 3 NAT blocks
> > from disk for one blkaddr.
> 
> My point is who should control this sort of caches.
> At least, we should focus on the hit ratio, not the worst case comparison.
> In the worst case, extent cache has more overhead than node page cache alone.

Yes, so this RFC version extent cache should be improved.

> 
> >
> > >
> > > > New extent cache belongs to filesystem level cache, it is completely controlled
> > > > by filesystem itself. What we can profit is: on the one hand, it is used as
> > > > first level cache above the node page cache, which can also increase the cache
> > > > hit ratio.
> > >
> > > I don't think so. The hit ratio depends on the cache policy. The node page
> > > cache is managed globally by kernel in LRU manner, so I think this can show
> > > affordable hit ratio.
> >
> > As I test, in this scenario we can have higher ratio hit in new extent cache
> > than original extent cache:
> > 1.write a large file
> > 2.write randomly in this file
> > 3.drop cache through drop_caches entry (or reclaim by MM)
> > 4.read this large file
> >
> > We cache all segregated extents in inode, so our ratio hit is 100% in
> > above scenario. If we add cache policy in extent cache, our hit ratio
> > will drop. But added policy can provide us more choose for different hit
> > ratio (may affect IO count) requirement of users.
> 
> Of course! Even though system wants to get more memory, you're trying to grab
> in-house memory for extents.
> Or, new shrinker reclaims some of them, but it will drop hit raio obviously.
> 
> Please see below as a summary.
> 
> >
> > >
> > > > On the other hand, it is more instable and controllable than node page
> > > > cache.
> > >
> > > It depends on how you can control the extent cache. But, I'm not sure that
> > > would be better than page cache managed by MM.
> > >
> > > So, my concerns are:
> > >
> > > 1. Redundant memory overhead
> > >  : The extent cache is likely on top of the node page cache, which will consume
> > >  memory redundantly.
> > >
> > > 2. CPU overhead
> > >  : In every block address updates, it needs to traverse extent cache entries.
> >
> > As I mentioned above, if extent cache has the ability of limitation and shrink,
> > overhead of memory and CPU can be controlled.
> >
> > >
> > > 3. Effectiveness
> > >  : We have a node page cache that is managed by MM in LRU order. I think this
> > >  provides good hit ratio, system-wide memory relciaming algorithms, and well-
> > >  defined locking mechanism.
> >
> > The effectiveness of new extent cache is the key point we should discuss.
> >
> > IMO, hit ratio of extent cache and memory/CPU cost can be tradeoff.
> > For example:
> > 1.if limitation is on, max value is 1; shrink is off:
> > 	our extent cache will be the same as original extent cache.
> > 2.if limitation is off, shrink is off:
> > 	our extent cache can provide high hit ratio when node page cache is
> > 	invalid but there is more memory/CPU overhead.
> > 3.there are more status between label 1 and label 2 as limitation and shrink is
> > set to different value.
> > So we just develop our extent cache for more functionality and selective.
> >
> > Another point is that, now size of struct extent_info is 24 bytes which is much
> > smaller than 4096 of node page size (ind-node page size is not calced), it's
> > cost-efficient to use small memory to store contiguous mapping. (This is pointed
> > out by Changman also)
> >
> > >
> > > 4. Cache reclaiming policy
> > >  a. global approach: it needs to consider lock contention, CPU overhead, and
> > >                      shrinker. I don't think it is better than page cache.
> > >  b. local approach: there still exists cold misses at the initial read
> > >                     operations. After then, how does the extent cache increase
> > > 		    hit ratio more than giving node page cache?
> > >
> > > 		    For example, in the case of pretty normal scenario like
> > > 		    open -> read -> close -> open -> read ..., we can't get
> > > 		    benefits form locally-managed extent cache, while node
> > > 		    page cache serves many block addresses.
> >
> > If this case should be covered, how about remembering these recently accessed extent
> > in a global list when evict, recovering extent cache from list when re-open?
> 
> Oh, surely, this example is from the local approach, not existing in
> the global one. :)
> 
> I guess you've understood my concerns enoughly.
> So, could you share a detailed design for all these stuffs?
> 
> For the design, I think these must be addressed seriously.
>  1. decide one of global or local management
>  2. should add a shrinker
>  3. reduce lock contention and critical region
> 
> Please, limit this feature to consume memory as less as possible, since it will
> increase memory footprint definitely.

Please see the draft below.

1) Extent management:
If we use global management that managing all extents which are from different
inodes in sbi, we will face with serious lock contention when we access these
extents belong to different inodes concurrently, the loss may outweights the
gain. So we choose a local management for extent which means all extents are
managed by inode itself to avoid above lock contention. Addtionlly, we manage
all extents globally by linking all inode into a global lru list for extent
cache shrinker.
Approach:
	a) build extent tree/rwlock/lru list/extent count in each inode.
		*extent tree: link all extent in rb-tree;
		*rwlock: protect fields when accessing extent cache concurrently;
		*lru list: sort all extents in accessing time order;
		*extent count: record total count of extents in cache.
	b) use lru shrink list in sbi to manage all inode which cached extents.
		*inode will be added or repostioned in this global list whenever
		extent is being access in this inode.
		*use spinlock to protect this shrink list.

2) Limitation:
In one inode, as we split or add extent in extent cache when read/write, extent
number will enlarge, so memory and CPU overhead will increase.
In order to control the overhead of memory and CPU, we try to set a upper bound
number to limit total extent number in each inode, This number is global
configuration which is visable to all inode. This number will be exported to
sysfs for configuring according to requirement of user. By default, designed
number is 8.

3) Shrinker:
There are two shrink paths:
	a) one is triggered when extent count has exceed the upper bound of
	inode's extent cache. We will try to release extent(s) from head of
	inode's inner extent lru list until extent count is equal to upper bound.
	This operation could be in f2fs_update_extent_cache().
	b) the other one is triggered when memory util exceed threshold, we try
	get inode from head of global lru list(s), and release extent(s) with
	fixed number (by default: 64 extents) in inode one by one.
	This operation could be in f2fs_balance_fs_bg().

4) Revalidation:
In ->evict(), extent cache will be released, in order to reuse extent cache
of inode when reopen for high hit ratio, a proper way is to add cached extent
tree into a hash table (could be radix tree), revalidate extent tree and recover
it to inode when reopen.
Besides, a global lru list is needed here for shrinker.

5) Readahead:
Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
path. Note, ra can affect lock contention for both ->rwlock in extent cache and
->lock of global shrink list.

To Jaegeuk, Changman,
How do you think? Any suggestion or complaint is welcome. :-)

Thanks,
Yu

> 
> Thanks,
> 
> >
> > Thanks,
> > Yu
> >
> > >
> > > This is my initial thought on the extent cache.
> > > Definitely, it is worth to discuss further in more details.
> > >
> > > Thanks,
> > >
> > > >
> > > > Thanks,
> > > > Yu
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > > >
> > > > > > >
> > > > > > > Anyway, mount option could be alternative for this patch.
> > > > > >
> > > > > > Yes, will do.
> > > > > >
> > > > > > Thanks,
> > > > > > Yu
> > > > > >
> > > > > > >
> > > > > > > On Fri, Dec 19, 2014 at 06:49:29PM +0800, Chao Yu wrote:
> > > > > > > > Now f2fs have page-block mapping cache which can cache only one extent mapping
> > > > > > > > between contiguous logical address and physical address.
> > > > > > > > Normally, this design will work well because f2fs will expand coverage area of
> > > > > > > > the mapping extent when we write forward sequentially. But when we write data
> > > > > > > > randomly in Out-Place-Update mode, the extent will be shorten and hardly be
> > > > > > > > expanded for most time as following reasons:
> > > > > > > > 1.The short part of extent will be discarded if we break contiguous mapping in
> > > > > > > > the middle of extent.
> > > > > > > > 2.The new mapping will be added into mapping cache only at head or tail of the
> > > > > > > > extent.
> > > > > > > > 3.We will drop the extent cache when the extent became very fragmented.
> > > > > > > > 4.We will not update the extent with mapping which we get from readpages or
> > > > > > > > readpage.
> > > > > > > >
> > > > > > > > To solve above problems, this patch adds extent cache base on rb-tree like other
> > > > > > > > filesystems (e.g.: ext4/btrfs) in f2fs. By this way, f2fs can support another
> > > > > > > > more effective cache between dnode page cache and disk. It will supply high hit
> > > > > > > > ratio in the cache with fewer memory when dnode page cache are reclaimed in
> > > > > > > > environment of low memory.
> > > > > > > >
> > > > > > > > Todo:
> > > > > > > > *introduce mount option for extent cache.
> > > > > > > > *add shrink ability for extent cache.
> > > > > > > >
> > > > > > > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > > > > > > ---


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-29  7:19               ` Chao Yu
@ 2014-12-29 21:23                 ` Jaegeuk Kim
  2014-12-30  0:32                   ` Changman Lee
  2014-12-30 10:10                   ` Chao Yu
  0 siblings, 2 replies; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-29 21:23 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Chao,

On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:

[snip]

Nice draft. :)

> 
> Please see the draft below.
> 
> 1) Extent management:
> If we use global management that managing all extents which are from different
> inodes in sbi, we will face with serious lock contention when we access these
> extents belong to different inodes concurrently, the loss may outweights the
> gain.

Agreed.

> So we choose a local management for extent which means all extents are
> managed by inode itself to avoid above lock contention. Addtionlly, we manage
> all extents globally by linking all inode into a global lru list for extent
> cache shrinker.
> Approach:
> 	a) build extent tree/rwlock/lru list/extent count in each inode.
> 		*extent tree: link all extent in rb-tree;
> 		*rwlock: protect fields when accessing extent cache concurrently;
> 		*lru list: sort all extents in accessing time order;
> 		*extent count: record total count of extents in cache.
> 	b) use lru shrink list in sbi to manage all inode which cached extents.
> 		*inode will be added or repostioned in this global list whenever
> 		extent is being access in this inode.
> 		*use spinlock to protect this shrink list.

1. How about adding a data structure with inode number instead of referring
inode pointer?

2. How about managing extent entries globally and setting an upper bound to
the number of extent entries instead of limiting them per each inode?
(The rb-tree will handle many extents per inode.)

3. It needs to set a minimum length for the candidate of extent cache.
 (e.g., 64)

So, for example,
struct ino_entry_for_extents {
	inode number;
	rb_tree for extent_entry objects;
	rwlock;
};

struct extent_entry {
	blkaddr, len;
	list_head *;
};

Something like this.

[A, B, C, ... are extent entry]

The sbi has
1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
              ` ino_entry_for_extents (#11) has A, C in rb-tree
              ` ino_entry_for_extents (#12) has F    in rb-tree
              ` ino_entry_for_extents (#13) has G, E in rb-tree

In f2fs_update_extent_cache and __get_data_block for #10,
  ino_entry_for_extents (#10) was founded and updated D or B.
  Then, updated entries are moved to MRU.

In f2fs_evict_inode for #11, A and C are moved to LRU.
But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
should be released.

In f2fs_balance_fs_bg, some LRU extents are released according to the amount
of consumed memory. Then, it frees any ino_entry_for_extents having no extent.

IMO, we don't need to consider readahead for this, since get_data_block will
be called by VFS readahead.

Furthermore, we need to think about whether LRU is really best or not.
IMO, the extent cache aims to improve second access speed, rather than initial
cold misses. So, maybe MRU or another algorithms would be better.

Thanks,

> 
> 2) Limitation:
> In one inode, as we split or add extent in extent cache when read/write, extent
> number will enlarge, so memory and CPU overhead will increase.
> In order to control the overhead of memory and CPU, we try to set a upper bound
> number to limit total extent number in each inode, This number is global
> configuration which is visable to all inode. This number will be exported to
> sysfs for configuring according to requirement of user. By default, designed
> number is 8.
> 
> 3) Shrinker:
> There are two shrink paths:
> 	a) one is triggered when extent count has exceed the upper bound of
> 	inode's extent cache. We will try to release extent(s) from head of
> 	inode's inner extent lru list until extent count is equal to upper bound.
> 	This operation could be in f2fs_update_extent_cache().
> 	b) the other one is triggered when memory util exceed threshold, we try
> 	get inode from head of global lru list(s), and release extent(s) with
> 	fixed number (by default: 64 extents) in inode one by one.
> 	This operation could be in f2fs_balance_fs_bg().
> 
> 4) Revalidation:
> In ->evict(), extent cache will be released, in order to reuse extent cache
> of inode when reopen for high hit ratio, a proper way is to add cached extent
> tree into a hash table (could be radix tree), revalidate extent tree and recover
> it to inode when reopen.
> Besides, a global lru list is needed here for shrinker.
> 
> 5) Readahead:
> Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> ->lock of global shrink list.
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-29 21:23                 ` Jaegeuk Kim
@ 2014-12-30  0:32                   ` Changman Lee
  2015-01-04  3:19                     ` Chao Yu
  2014-12-30 10:10                   ` Chao Yu
  1 sibling, 1 reply; 26+ messages in thread
From: Changman Lee @ 2014-12-30  0:32 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: Chao Yu, linux-f2fs-devel, linux-kernel

Hi all,

On Mon, Dec 29, 2014 at 01:23:00PM -0800, Jaegeuk Kim wrote:
> Hi Chao,
> 
> On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> 
> [snip]
> 
> Nice draft. :)
> 
> > 
> > Please see the draft below.
> > 
> > 1) Extent management:
> > If we use global management that managing all extents which are from different
> > inodes in sbi, we will face with serious lock contention when we access these
> > extents belong to different inodes concurrently, the loss may outweights the
> > gain.
> 
> Agreed.
> 
> > So we choose a local management for extent which means all extents are
> > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > all extents globally by linking all inode into a global lru list for extent
> > cache shrinker.
> > Approach:
> > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > 		*extent tree: link all extent in rb-tree;
> > 		*rwlock: protect fields when accessing extent cache concurrently;
> > 		*lru list: sort all extents in accessing time order;
> > 		*extent count: record total count of extents in cache.
> > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > 		*inode will be added or repostioned in this global list whenever
> > 		extent is being access in this inode.
> > 		*use spinlock to protect this shrink list.
> 
> 1. How about adding a data structure with inode number instead of referring
> inode pointer?
> 
> 2. How about managing extent entries globally and setting an upper bound to
> the number of extent entries instead of limiting them per each inode?
> (The rb-tree will handle many extents per inode.)
> 
> 3. It needs to set a minimum length for the candidate of extent cache.
>  (e.g., 64)
> 

Agreed.

> So, for example,
> struct ino_entry_for_extents {
> 	inode number;
> 	rb_tree for extent_entry objects;
> 	rwlock;
> };
> 
> struct extent_entry {
> 	blkaddr, len;
> 	list_head *;
> };
> 
> Something like this.
> 
> [A, B, C, ... are extent entry]
> 
> The sbi has
> 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
>               ` ino_entry_for_extents (#11) has A, C in rb-tree
>               ` ino_entry_for_extents (#12) has F    in rb-tree
>               ` ino_entry_for_extents (#13) has G, E in rb-tree
> 
> In f2fs_update_extent_cache and __get_data_block for #10,
>   ino_entry_for_extents (#10) was founded and updated D or B.
>   Then, updated entries are moved to MRU.
> 
> In f2fs_evict_inode for #11, A and C are moved to LRU.
> But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> should be released.
> 
> In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> 
> IMO, we don't need to consider readahead for this, since get_data_block will
> be called by VFS readahead.
> 
> Furthermore, we need to think about whether LRU is really best or not.
> IMO, the extent cache aims to improve second access speed, rather than initial
> cold misses. So, maybe MRU or another algorithms would be better.
> 

Right. It's very comflicated to judge which is better.
In read or write path, extents could be made every time. At that time, we should
decide which extent evicts instead of new extents if we set upper bound.
In update, one extent could be seperated into 3. It requires 3 insertion and 1 deletion.
So if update happends frequently, we could give up extent management for some ranges.
And we need to bring ideas from vm managemnt. For example,
active/inactive list and second chance to promotion, or batch work for insertion/deletion

I thought suddenly 'Simple is best'.
Let's think about better ideas together.

> Thanks,
> 
> > 
> > 2) Limitation:
> > In one inode, as we split or add extent in extent cache when read/write, extent
> > number will enlarge, so memory and CPU overhead will increase.
> > In order to control the overhead of memory and CPU, we try to set a upper bound
> > number to limit total extent number in each inode, This number is global
> > configuration which is visable to all inode. This number will be exported to
> > sysfs for configuring according to requirement of user. By default, designed
> > number is 8.
> > 

Chao,
It's better which # of extent are controlled globally rather than limit extents
per inode as Jaegeuk said to reduce extent management overhead.

> > 3) Shrinker:
> > There are two shrink paths:
> > 	a) one is triggered when extent count has exceed the upper bound of
> > 	inode's extent cache. We will try to release extent(s) from head of
> > 	inode's inner extent lru list until extent count is equal to upper bound.
> > 	This operation could be in f2fs_update_extent_cache().
> > 	b) the other one is triggered when memory util exceed threshold, we try
> > 	get inode from head of global lru list(s), and release extent(s) with
> > 	fixed number (by default: 64 extents) in inode one by one.
> > 	This operation could be in f2fs_balance_fs_bg().
> > 

Let's consider to use register_shrinker which is called by vm when
memory pressure happens.

> > 4) Revalidation:
> > In ->evict(), extent cache will be released, in order to reuse extent cache
> > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > it to inode when reopen.
> > Besides, a global lru list is needed here for shrinker.
> > 
> > 5) Readahead:
> > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > ->lock of global shrink list.
> > 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-29 21:23                 ` Jaegeuk Kim
  2014-12-30  0:32                   ` Changman Lee
@ 2014-12-30 10:10                   ` Chao Yu
  2014-12-31  8:26                     ` Jaegeuk Kim
  1 sibling, 1 reply; 26+ messages in thread
From: Chao Yu @ 2014-12-30 10:10 UTC (permalink / raw)
  To: 'Jaegeuk Kim'
  Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Tuesday, December 30, 2014 5:23 AM
> To: Chao Yu
> Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> 
> [snip]
> 
> Nice draft. :)

Thanks! :)

> 
> >
> > Please see the draft below.
> >
> > 1) Extent management:
> > If we use global management that managing all extents which are from different
> > inodes in sbi, we will face with serious lock contention when we access these
> > extents belong to different inodes concurrently, the loss may outweights the
> > gain.
> 
> Agreed.
> 
> > So we choose a local management for extent which means all extents are
> > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > all extents globally by linking all inode into a global lru list for extent
> > cache shrinker.
> > Approach:
> > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > 		*extent tree: link all extent in rb-tree;
> > 		*rwlock: protect fields when accessing extent cache concurrently;
> > 		*lru list: sort all extents in accessing time order;
> > 		*extent count: record total count of extents in cache.
> > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > 		*inode will be added or repostioned in this global list whenever
> > 		extent is being access in this inode.
> > 		*use spinlock to protect this shrink list.
> 
> 1. How about adding a data structure with inode number instead of referring
> inode pointer?
> 
> 2. How about managing extent entries globally and setting an upper bound to
> the number of extent entries instead of limiting them per each inode?
> (The rb-tree will handle many extents per inode.)

[assumption]
Approach A: global lru list;
Approach B: lru list per inode.

I have considered Approach A before writing the draft, although Approach A could
provide us higher ratio hit due to global lru, it may make lock contention more
intensively and has more memory overhead descripted below. So I choose more
conservative Approach B.
1) as extent_entry will split in Cow, we may lock extent_list more times when
move these separated extent entries in extent_list, unless we have batch mode
interface.
2) lock overhead between shrinker and lookuper and updater.
e.g. extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
(#1) has A, C, E, G in rb-tree
(#2) has B, D, F in rb-tree
shrinker op: lock list -> trylock #1 -> unlock list -> free A -> unlock #1
	lock list -> trylock #2 -> unlock list -> free B -> unlock #2
	lock list -> trylock #1 -> unlock list -> free C -> unlock #1
	...
*trylock/unlock* for all free extent entries belong to one inode will be separated
to lots of parts, resulting in more contention.
3) it has more memory overhead in each extent entry:
Approach A: need add inode number for backref and list_head *
Approach B: need add list_head *

Or maybe it's not a problem because we can afford all these above.

Anyway, I'm not against.

> 
> 3. It needs to set a minimum length for the candidate of extent cache.
>  (e.g., 64)

Is this used for shrinker? Can you please explain more about this minimum length?

> 
> So, for example,
> struct ino_entry_for_extents {
> 	inode number;
> 	rb_tree for extent_entry objects;
> 	rwlock;
> };
> 
> struct extent_entry {
> 	blkaddr, len;
> 	list_head *;

We should add one other field 'inode number' here, as through it we can get
rb-tree root from ino_entry_for_extents for rb_erase when we free the extent
entry in shrinker.

> };
> 
> Something like this.
> 
> [A, B, C, ... are extent entry]
> 
> The sbi has
> 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
>               ` ino_entry_for_extents (#11) has A, C in rb-tree
>               ` ino_entry_for_extents (#12) has F    in rb-tree
>               ` ino_entry_for_extents (#13) has G, E in rb-tree
> 
> In f2fs_update_extent_cache and __get_data_block for #10,
>   ino_entry_for_extents (#10) was founded and updated D or B.
>   Then, updated entries are moved to MRU.
> 
> In f2fs_evict_inode for #11, A and C are moved to LRU.
> But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> should be released.
> 
> In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> 
> IMO, we don't need to consider readahead for this, since get_data_block will
> be called by VFS readahead.

In get_data_block we can cache only one blkaddr once after get_dnode_of_data
returned, it seems less efficient. So why not readahead selectively more
contiguous valid blkaddr in get_dnode_of_data once?

> 
> Furthermore, we need to think about whether LRU is really best or not.
> IMO, the extent cache aims to improve second access speed, rather than initial
> cold misses. So, maybe MRU or another algorithms would be better.

Agree, let's rethink about this. :)

Thanks,

> 
> Thanks,
> 
> >
> > 2) Limitation:
> > In one inode, as we split or add extent in extent cache when read/write, extent
> > number will enlarge, so memory and CPU overhead will increase.
> > In order to control the overhead of memory and CPU, we try to set a upper bound
> > number to limit total extent number in each inode, This number is global
> > configuration which is visable to all inode. This number will be exported to
> > sysfs for configuring according to requirement of user. By default, designed
> > number is 8.
> >
> > 3) Shrinker:
> > There are two shrink paths:
> > 	a) one is triggered when extent count has exceed the upper bound of
> > 	inode's extent cache. We will try to release extent(s) from head of
> > 	inode's inner extent lru list until extent count is equal to upper bound.
> > 	This operation could be in f2fs_update_extent_cache().
> > 	b) the other one is triggered when memory util exceed threshold, we try
> > 	get inode from head of global lru list(s), and release extent(s) with
> > 	fixed number (by default: 64 extents) in inode one by one.
> > 	This operation could be in f2fs_balance_fs_bg().
> >
> > 4) Revalidation:
> > In ->evict(), extent cache will be released, in order to reuse extent cache
> > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > it to inode when reopen.
> > Besides, a global lru list is needed here for shrinker.
> >
> > 5) Readahead:
> > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > ->lock of global shrink list.
> >


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-30 10:10                   ` Chao Yu
@ 2014-12-31  8:26                     ` Jaegeuk Kim
  2015-01-04  8:24                       ` Chao Yu
  0 siblings, 1 reply; 26+ messages in thread
From: Jaegeuk Kim @ 2014-12-31  8:26 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Chao,

On Tue, Dec 30, 2014 at 06:10:21PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Tuesday, December 30, 2014 5:23 AM
> > To: Chao Yu
> > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Chao,
> > 
> > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > 
> > [snip]
> > 
> > Nice draft. :)
> 
> Thanks! :)
> 
> > 
> > >
> > > Please see the draft below.
> > >
> > > 1) Extent management:
> > > If we use global management that managing all extents which are from different
> > > inodes in sbi, we will face with serious lock contention when we access these
> > > extents belong to different inodes concurrently, the loss may outweights the
> > > gain.
> > 
> > Agreed.
> > 
> > > So we choose a local management for extent which means all extents are
> > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > all extents globally by linking all inode into a global lru list for extent
> > > cache shrinker.
> > > Approach:
> > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > 		*extent tree: link all extent in rb-tree;
> > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > 		*lru list: sort all extents in accessing time order;
> > > 		*extent count: record total count of extents in cache.
> > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > 		*inode will be added or repostioned in this global list whenever
> > > 		extent is being access in this inode.
> > > 		*use spinlock to protect this shrink list.
> > 
> > 1. How about adding a data structure with inode number instead of referring
> > inode pointer?
> > 
> > 2. How about managing extent entries globally and setting an upper bound to
> > the number of extent entries instead of limiting them per each inode?
> > (The rb-tree will handle many extents per inode.)
> 
> [assumption]
> Approach A: global lru list;
> Approach B: lru list per inode.
> 
> I have considered Approach A before writing the draft, although Approach A could
> provide us higher ratio hit due to global lru, it may make lock contention more
> intensively and has more memory overhead descripted below. So I choose more
> conservative Approach B.
> 1) as extent_entry will split in Cow, we may lock extent_list more times when
> move these separated extent entries in extent_list, unless we have batch mode
> interface.
> 2) lock overhead between shrinker and lookuper and updater.
> e.g. extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> (#1) has A, C, E, G in rb-tree
> (#2) has B, D, F in rb-tree
> shrinker op: lock list -> trylock #1 -> unlock list -> free A -> unlock #1
> 	lock list -> trylock #2 -> unlock list -> free B -> unlock #2
> 	lock list -> trylock #1 -> unlock list -> free C -> unlock #1
> 	...

lock_list -> list_del(A) -> unlock_list -> lock #1 -> remove_tree(A) -> unlock #1
-> free A.

Or,
lock_list -> list_del(A, B, C) -> unlock_list -> lock #1 -> remove_tree(A, C) ->
unlock #1 -> free A, C -> lock #2 -> remove_tree(B) -> unlock #2

I think a couple of list operations do not cause severe lock contention.
And, if we assing minimum length for extent caches, the contention would be
distributed.

This means that, it needs to manage each of all the extents in the cache should
have the length larger than minimum value.

> *trylock/unlock* for all free extent entries belong to one inode will be separated
> to lots of parts, resulting in more contention.
> 3) it has more memory overhead in each extent entry:
> Approach A: need add inode number for backref and list_head *

It doesn't need to add ino. Just remain them, and shrinker can remove empty
ino_entry_for_extents periodically.

> Approach B: need add list_head *
> 
> Or maybe it's not a problem because we can afford all these above.
> 
> Anyway, I'm not against.
> 
> > 
> > 3. It needs to set a minimum length for the candidate of extent cache.
> >  (e.g., 64)
> 
> Is this used for shrinker? Can you please explain more about this minimum length?
> 
> > 
> > So, for example,
> > struct ino_entry_for_extents {
> > 	inode number;
> > 	rb_tree for extent_entry objects;
> > 	rwlock;
> > };
> > 
> > struct extent_entry {
> > 	blkaddr, len;
> > 	list_head *;
> 
> We should add one other field 'inode number' here, as through it we can get
> rb-tree root from ino_entry_for_extents for rb_erase when we free the extent
> entry in shrinker.
> 
> > };
> > 
> > Something like this.
> > 
> > [A, B, C, ... are extent entry]
> > 
> > The sbi has
> > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> >               ` ino_entry_for_extents (#12) has F    in rb-tree
> >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> > 
> > In f2fs_update_extent_cache and __get_data_block for #10,
> >   ino_entry_for_extents (#10) was founded and updated D or B.
> >   Then, updated entries are moved to MRU.
> > 
> > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > should be released.
> > 
> > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> > 
> > IMO, we don't need to consider readahead for this, since get_data_block will
> > be called by VFS readahead.
> 
> In get_data_block we can cache only one blkaddr once after get_dnode_of_data
> returned, it seems less efficient. So why not readahead selectively more
> contiguous valid blkaddr in get_dnode_of_data once?

Why do you think only one blkaddr?
get_data_block searches extents as many as possible.

Thanks,

> 
> > 
> > Furthermore, we need to think about whether LRU is really best or not.
> > IMO, the extent cache aims to improve second access speed, rather than initial
> > cold misses. So, maybe MRU or another algorithms would be better.
> 
> Agree, let's rethink about this. :)
> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > >
> > > 2) Limitation:
> > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > number will enlarge, so memory and CPU overhead will increase.
> > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > number to limit total extent number in each inode, This number is global
> > > configuration which is visable to all inode. This number will be exported to
> > > sysfs for configuring according to requirement of user. By default, designed
> > > number is 8.
> > >
> > > 3) Shrinker:
> > > There are two shrink paths:
> > > 	a) one is triggered when extent count has exceed the upper bound of
> > > 	inode's extent cache. We will try to release extent(s) from head of
> > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > 	This operation could be in f2fs_update_extent_cache().
> > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > 	get inode from head of global lru list(s), and release extent(s) with
> > > 	fixed number (by default: 64 extents) in inode one by one.
> > > 	This operation could be in f2fs_balance_fs_bg().
> > >
> > > 4) Revalidation:
> > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > it to inode when reopen.
> > > Besides, a global lru list is needed here for shrinker.
> > >
> > > 5) Readahead:
> > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > ->lock of global shrink list.
> > >

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-30  0:32                   ` Changman Lee
@ 2015-01-04  3:19                     ` Chao Yu
  2015-01-07 11:16                       ` Changman Lee
  0 siblings, 1 reply; 26+ messages in thread
From: Chao Yu @ 2015-01-04  3:19 UTC (permalink / raw)
  To: 'Changman Lee', 'Jaegeuk Kim'
  Cc: linux-f2fs-devel, linux-kernel

Hi Changman,

Sorry for replying late!

> -----Original Message-----
> From: Changman Lee [mailto:cm224.lee@samsung.com]
> Sent: Tuesday, December 30, 2014 8:32 AM
> To: Jaegeuk Kim
> Cc: Chao Yu; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi all,
> 
> On Mon, Dec 29, 2014 at 01:23:00PM -0800, Jaegeuk Kim wrote:
> > Hi Chao,
> >
> > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> >
> > [snip]
> >
> > Nice draft. :)
> >
> > >
> > > Please see the draft below.
> > >
> > > 1) Extent management:
> > > If we use global management that managing all extents which are from different
> > > inodes in sbi, we will face with serious lock contention when we access these
> > > extents belong to different inodes concurrently, the loss may outweights the
> > > gain.
> >
> > Agreed.
> >
> > > So we choose a local management for extent which means all extents are
> > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > all extents globally by linking all inode into a global lru list for extent
> > > cache shrinker.
> > > Approach:
> > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > 		*extent tree: link all extent in rb-tree;
> > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > 		*lru list: sort all extents in accessing time order;
> > > 		*extent count: record total count of extents in cache.
> > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > 		*inode will be added or repostioned in this global list whenever
> > > 		extent is being access in this inode.
> > > 		*use spinlock to protect this shrink list.
> >
> > 1. How about adding a data structure with inode number instead of referring
> > inode pointer?
> >
> > 2. How about managing extent entries globally and setting an upper bound to
> > the number of extent entries instead of limiting them per each inode?
> > (The rb-tree will handle many extents per inode.)
> >
> > 3. It needs to set a minimum length for the candidate of extent cache.
> >  (e.g., 64)
> >
> 
> Agreed.
> 
> > So, for example,
> > struct ino_entry_for_extents {
> > 	inode number;
> > 	rb_tree for extent_entry objects;
> > 	rwlock;
> > };
> >
> > struct extent_entry {
> > 	blkaddr, len;
> > 	list_head *;
> > };
> >
> > Something like this.
> >
> > [A, B, C, ... are extent entry]
> >
> > The sbi has
> > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> >               ` ino_entry_for_extents (#12) has F    in rb-tree
> >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> >
> > In f2fs_update_extent_cache and __get_data_block for #10,
> >   ino_entry_for_extents (#10) was founded and updated D or B.
> >   Then, updated entries are moved to MRU.
> >
> > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > should be released.
> >
> > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> >
> > IMO, we don't need to consider readahead for this, since get_data_block will
> > be called by VFS readahead.
> >
> > Furthermore, we need to think about whether LRU is really best or not.
> > IMO, the extent cache aims to improve second access speed, rather than initial
> > cold misses. So, maybe MRU or another algorithms would be better.
> >
> 
> Right. It's very comflicated to judge which is better.
> In read or write path, extents could be made every time. At that time, we should
> decide which extent evicts instead of new extents if we set upper bound.
> In update, one extent could be seperated into 3. It requires 3 insertion and 1 deletion.
> So if update happends frequently, we could give up extent management for some ranges.
> And we need to bring ideas from vm managemnt. For example,
> active/inactive list and second chance to promotion, or batch work for insertion/deletion
> 
> I thought suddenly 'Simple is best'.
> Let's think about better ideas together.

Yeah, how about using an opposite way to the way of page cache manager?

for example:
node page A,B,C,D is in page cache;
extent a,b,c,d is in extent cache;
extent a is built from page A, ..., d is built from page D.
page cache: LRU A -> B -> C -> D MRU
extent cache: LRU a -> b -> c -> d MRU

If we use
1) the same way LRU, cache pair A-a, B-b, ... may be reclaimed in the same time as OOM.
2) the opposite way, maybe A,B in page cache and d,c in extent cache will be reclaimed,
but we still can hit whole cache in combination of page cache and extent cache.

So by the way '2)' we can increase the total coverage area of page cache + extent cache.

> 
> > Thanks,
> >
> > >
> > > 2) Limitation:
> > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > number will enlarge, so memory and CPU overhead will increase.
> > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > number to limit total extent number in each inode, This number is global
> > > configuration which is visable to all inode. This number will be exported to
> > > sysfs for configuring according to requirement of user. By default, designed
> > > number is 8.
> > >
> 
> Chao,
> It's better which # of extent are controlled globally rather than limit extents
> per inode as Jaegeuk said to reduce extent management overhead.

It's OK.

> 
> > > 3) Shrinker:
> > > There are two shrink paths:
> > > 	a) one is triggered when extent count has exceed the upper bound of
> > > 	inode's extent cache. We will try to release extent(s) from head of
> > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > 	This operation could be in f2fs_update_extent_cache().
> > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > 	get inode from head of global lru list(s), and release extent(s) with
> > > 	fixed number (by default: 64 extents) in inode one by one.
> > > 	This operation could be in f2fs_balance_fs_bg().
> > >
> 
> Let's consider to use register_shrinker which is called by vm when
> memory pressure happens.

Great, thanks for your reminding! :-)

Thanks,
Yu

> 
> > > 4) Revalidation:
> > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > it to inode when reopen.
> > > Besides, a global lru list is needed here for shrinker.
> > >
> > > 5) Readahead:
> > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > ->lock of global shrink list.
> > >


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2014-12-31  8:26                     ` Jaegeuk Kim
@ 2015-01-04  8:24                       ` Chao Yu
  2015-01-06 23:00                           ` Jaegeuk Kim
  0 siblings, 1 reply; 26+ messages in thread
From: Chao Yu @ 2015-01-04  8:24 UTC (permalink / raw)
  To: 'Jaegeuk Kim'
  Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Wednesday, December 31, 2014 4:26 PM
> To: Chao Yu
> Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Tue, Dec 30, 2014 at 06:10:21PM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Tuesday, December 30, 2014 5:23 AM
> > > To: Chao Yu
> > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi Chao,
> > >
> > > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > >
> > > [snip]
> > >
> > > Nice draft. :)
> >
> > Thanks! :)
> >
> > >
> > > >
> > > > Please see the draft below.
> > > >
> > > > 1) Extent management:
> > > > If we use global management that managing all extents which are from different
> > > > inodes in sbi, we will face with serious lock contention when we access these
> > > > extents belong to different inodes concurrently, the loss may outweights the
> > > > gain.
> > >
> > > Agreed.
> > >
> > > > So we choose a local management for extent which means all extents are
> > > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > > all extents globally by linking all inode into a global lru list for extent
> > > > cache shrinker.
> > > > Approach:
> > > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > > 		*extent tree: link all extent in rb-tree;
> > > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > > 		*lru list: sort all extents in accessing time order;
> > > > 		*extent count: record total count of extents in cache.
> > > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > > 		*inode will be added or repostioned in this global list whenever
> > > > 		extent is being access in this inode.
> > > > 		*use spinlock to protect this shrink list.
> > >
> > > 1. How about adding a data structure with inode number instead of referring
> > > inode pointer?
> > >
> > > 2. How about managing extent entries globally and setting an upper bound to
> > > the number of extent entries instead of limiting them per each inode?
> > > (The rb-tree will handle many extents per inode.)
> >
> > [assumption]
> > Approach A: global lru list;
> > Approach B: lru list per inode.
> >
> > I have considered Approach A before writing the draft, although Approach A could
> > provide us higher ratio hit due to global lru, it may make lock contention more
> > intensively and has more memory overhead descripted below. So I choose more
> > conservative Approach B.
> > 1) as extent_entry will split in Cow, we may lock extent_list more times when
> > move these separated extent entries in extent_list, unless we have batch mode
> > interface.
> > 2) lock overhead between shrinker and lookuper and updater.
> > e.g. extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > (#1) has A, C, E, G in rb-tree
> > (#2) has B, D, F in rb-tree
> > shrinker op: lock list -> trylock #1 -> unlock list -> free A -> unlock #1
> > 	lock list -> trylock #2 -> unlock list -> free B -> unlock #2
> > 	lock list -> trylock #1 -> unlock list -> free C -> unlock #1
> > 	...
> 
> lock_list -> list_del(A) -> unlock_list -> lock #1 -> remove_tree(A) -> unlock #1
> -> free A.

If unlock_list before lock #1, (A) could be modified by other thread between ->unlock_list
and lock #1, so one more "lookup_tree(A)" is needed under lock #1:
lock #1 -> lookup_tree(A) -> remove_tree(A) -> unlock #1

> 
> Or,
> lock_list -> list_del(A, B, C) -> unlock_list -> lock #1 -> remove_tree(A, C) ->
> unlock #1 -> free A, C -> lock #2 -> remove_tree(B) -> unlock #2

ditto.
To avoid unneeded lookup_tree(A), maybe we can use this series:

lock list -> get A from head -> trylock #1 -> try to get more node(C, E, G) of #1 in list
-> unlock list -> remove_tree&free(A, C, E, G) -> unlock #1

> 
> I think a couple of list operations do not cause severe lock contention.

Yeah, batch operation is better.
But sometimes random write break extent into everywhere in shrinker's list,
so batch operation can gain less in this condition.

> And, if we assing minimum length for extent caches, the contention would be
> distributed.
> 
> This means that, it needs to manage each of all the extents in the cache should
> have the length larger than minimum value.

If I do not misunderstand, what you mean is that if extent number in one inode cache
is bigger than minimum value, then, we will start to add all extents of this inode
into shrinker's list, and reposition them in the list when f2fs_{lookup,update}_extent_cache?

> 
> > *trylock/unlock* for all free extent entries belong to one inode will be separated
> > to lots of parts, resulting in more contention.
> > 3) it has more memory overhead in each extent entry:
> > Approach A: need add inode number for backref and list_head *
> 
> It doesn't need to add ino. Just remain them, and shrinker can remove empty
> ino_entry_for_extents periodically.

I do not understand why we don't need ino, :(.
Without ino, how can we get the rb-tree root pointer for rb_erase(node, root)?

> 
> > Approach B: need add list_head *
> >
> > Or maybe it's not a problem because we can afford all these above.
> >
> > Anyway, I'm not against.
> >
> > >
> > > 3. It needs to set a minimum length for the candidate of extent cache.
> > >  (e.g., 64)
> >
> > Is this used for shrinker? Can you please explain more about this minimum length?
> >
> > >
> > > So, for example,
> > > struct ino_entry_for_extents {
> > > 	inode number;
> > > 	rb_tree for extent_entry objects;
> > > 	rwlock;
> > > };
> > >
> > > struct extent_entry {
> > > 	blkaddr, len;
> > > 	list_head *;
> >
> > We should add one other field 'inode number' here, as through it we can get
> > rb-tree root from ino_entry_for_extents for rb_erase when we free the extent
> > entry in shrinker.
> >
> > > };
> > >
> > > Something like this.
> > >
> > > [A, B, C, ... are extent entry]
> > >
> > > The sbi has
> > > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> > >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> > >               ` ino_entry_for_extents (#12) has F    in rb-tree
> > >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> > >
> > > In f2fs_update_extent_cache and __get_data_block for #10,
> > >   ino_entry_for_extents (#10) was founded and updated D or B.
> > >   Then, updated entries are moved to MRU.
> > >
> > > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > > should be released.
> > >
> > > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> > >
> > > IMO, we don't need to consider readahead for this, since get_data_block will
> > > be called by VFS readahead.
> >
> > In get_data_block we can cache only one blkaddr once after get_dnode_of_data
> > returned, it seems less efficient. So why not readahead selectively more
> > contiguous valid blkaddr in get_dnode_of_data once?
> 
> Why do you think only one blkaddr?
> get_data_block searches extents as many as possible.

Surely not one blkaddr.

Actually, what I mean is that:
->get_dnode_of_data
  ->f2fs_ra_extent_cache()

f2fs_ra_extent_cache(node_page, offset) {
	for (i = offset; i < max; i++) {
		if (is_valid() && is_contiguous())
			len++;
	}
	f2fs_update_extent_cache(inode, fofs, blk_addr, len)
}

But not:
->__get_data_block
  ->get_dnode_of_data
  ->f2fs_update_extent_cache(inode, fofs, blk_addr, 1);

Thanks,
Yu

> 
> Thanks,
> 
> >
> > >
> > > Furthermore, we need to think about whether LRU is really best or not.
> > > IMO, the extent cache aims to improve second access speed, rather than initial
> > > cold misses. So, maybe MRU or another algorithms would be better.
> >
> > Agree, let's rethink about this. :)
> >
> > Thanks,
> >
> > >
> > > Thanks,
> > >
> > > >
> > > > 2) Limitation:
> > > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > > number will enlarge, so memory and CPU overhead will increase.
> > > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > > number to limit total extent number in each inode, This number is global
> > > > configuration which is visable to all inode. This number will be exported to
> > > > sysfs for configuring according to requirement of user. By default, designed
> > > > number is 8.
> > > >
> > > > 3) Shrinker:
> > > > There are two shrink paths:
> > > > 	a) one is triggered when extent count has exceed the upper bound of
> > > > 	inode's extent cache. We will try to release extent(s) from head of
> > > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > > 	This operation could be in f2fs_update_extent_cache().
> > > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > > 	get inode from head of global lru list(s), and release extent(s) with
> > > > 	fixed number (by default: 64 extents) in inode one by one.
> > > > 	This operation could be in f2fs_balance_fs_bg().
> > > >
> > > > 4) Revalidation:
> > > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > > it to inode when reopen.
> > > > Besides, a global lru list is needed here for shrinker.
> > > >
> > > > 5) Readahead:
> > > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > > ->lock of global shrink list.
> > > >


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2015-01-04  8:24                       ` Chao Yu
@ 2015-01-06 23:00                           ` Jaegeuk Kim
  0 siblings, 0 replies; 26+ messages in thread
From: Jaegeuk Kim @ 2015-01-06 23:00 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Chao,

On Sun, Jan 04, 2015 at 04:24:10PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Wednesday, December 31, 2014 4:26 PM
> > To: Chao Yu
> > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Chao,
> > 
> > On Tue, Dec 30, 2014 at 06:10:21PM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > > -----Original Message-----
> > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > Sent: Tuesday, December 30, 2014 5:23 AM
> > > > To: Chao Yu
> > > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > >
> > > > Hi Chao,
> > > >
> > > > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > > >
> > > > [snip]
> > > >
> > > > Nice draft. :)
> > >
> > > Thanks! :)
> > >
> > > >
> > > > >
> > > > > Please see the draft below.
> > > > >
> > > > > 1) Extent management:
> > > > > If we use global management that managing all extents which are from different
> > > > > inodes in sbi, we will face with serious lock contention when we access these
> > > > > extents belong to different inodes concurrently, the loss may outweights the
> > > > > gain.
> > > >
> > > > Agreed.
> > > >
> > > > > So we choose a local management for extent which means all extents are
> > > > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > > > all extents globally by linking all inode into a global lru list for extent
> > > > > cache shrinker.
> > > > > Approach:
> > > > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > > > 		*extent tree: link all extent in rb-tree;
> > > > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > > > 		*lru list: sort all extents in accessing time order;
> > > > > 		*extent count: record total count of extents in cache.
> > > > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > > > 		*inode will be added or repostioned in this global list whenever
> > > > > 		extent is being access in this inode.
> > > > > 		*use spinlock to protect this shrink list.
> > > >
> > > > 1. How about adding a data structure with inode number instead of referring
> > > > inode pointer?
> > > >
> > > > 2. How about managing extent entries globally and setting an upper bound to
> > > > the number of extent entries instead of limiting them per each inode?
> > > > (The rb-tree will handle many extents per inode.)
> > >
> > > [assumption]
> > > Approach A: global lru list;
> > > Approach B: lru list per inode.
> > >
> > > I have considered Approach A before writing the draft, although Approach A could
> > > provide us higher ratio hit due to global lru, it may make lock contention more
> > > intensively and has more memory overhead descripted below. So I choose more
> > > conservative Approach B.
> > > 1) as extent_entry will split in Cow, we may lock extent_list more times when
> > > move these separated extent entries in extent_list, unless we have batch mode
> > > interface.
> > > 2) lock overhead between shrinker and lookuper and updater.
> > > e.g. extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > (#1) has A, C, E, G in rb-tree
> > > (#2) has B, D, F in rb-tree
> > > shrinker op: lock list -> trylock #1 -> unlock list -> free A -> unlock #1
> > > 	lock list -> trylock #2 -> unlock list -> free B -> unlock #2
> > > 	lock list -> trylock #1 -> unlock list -> free C -> unlock #1
> > > 	...
> > 
> > lock_list -> list_del(A) -> unlock_list -> lock #1 -> remove_tree(A) -> unlock #1
> > -> free A.
> 
> If unlock_list before lock #1, (A) could be modified by other thread between ->unlock_list
> and lock #1, so one more "lookup_tree(A)" is needed under lock #1:
> lock #1 -> lookup_tree(A) -> remove_tree(A) -> unlock #1
> 
> > 
> > Or,
> > lock_list -> list_del(A, B, C) -> unlock_list -> lock #1 -> remove_tree(A, C) ->
> > unlock #1 -> free A, C -> lock #2 -> remove_tree(B) -> unlock #2
> 
> ditto.
> To avoid unneeded lookup_tree(A), maybe we can use this series:
> 
> lock list -> get A from head -> trylock #1 -> try to get more node(C, E, G) of #1 in list
> -> unlock list -> remove_tree&free(A, C, E, G) -> unlock #1
> 
> > 
> > I think a couple of list operations do not cause severe lock contention.
> 
> Yeah, batch operation is better.
> But sometimes random write break extent into everywhere in shrinker's list,
> so batch operation can gain less in this condition.
> 
> > And, if we assing minimum length for extent caches, the contention would be
> > distributed.
> > 
> > This means that, it needs to manage each of all the extents in the cache should
> > have the length larger than minimum value.
> 
> If I do not misunderstand, what you mean is that if extent number in one inode cache
> is bigger than minimum value, then, we will start to add all extents of this inode
> into shrinker's list, and reposition them in the list when f2fs_{lookup,update}_extent_cache?
> 
> > 
> > > *trylock/unlock* for all free extent entries belong to one inode will be separated
> > > to lots of parts, resulting in more contention.
> > > 3) it has more memory overhead in each extent entry:
> > > Approach A: need add inode number for backref and list_head *
> > 
> > It doesn't need to add ino. Just remain them, and shrinker can remove empty
> > ino_entry_for_extents periodically.
> 
> I do not understand why we don't need ino, :(.
> Without ino, how can we get the rb-tree root pointer for rb_erase(node, root)?

Let me describe in more detail.

For example,

- global radix tree in sbi
  --> ino #1 --> rb_tree (rb_tree, rb_tree_lock, refcount=0)
  --> ino #2 --> rb_tree
  --> ino #3 --> rb_tree
                    `-> extent A 
		    `-> extent B (fofs, blkaddr, length, *list_head=empty)

- global extent list in sbi

  extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
  (#1) has A, C, E, G in rb-tree
  (#2) has B, D, F in rb-tree

1) update extents (#1)

 - lock_radix_tree
     if (notfound #1)
         allocate #1;
     #1's refcount++;
 - unlock_radix_tree

 - lock_rb_tree (#1)
     update A
     rb_delete E, if E->length is smaller than MIN_LEN.
     allocate & rb_add K, if K->length is larget than MIN_LEN.

     list_lock
     list_move(A)
     list_del(E)
     list_add_tail(K)
     list_unlock
 - unlock_rb_tree (#1)

 - #1's rafcount--;

2) shrinker

 - list_lock
     list_del A, B, C, ...
 - list_unlock

 - gang_look_up_radix_tree_with_lock {

     #N's refcount++;

     lock_rb_tree (#N)
         for_each_rb_node {
             if (list_empty(extent->list_head)) {
                 remove_rb_node(extent);
                 free(extent);
             }
	 }
     unlock_rb_tree (#N)

     #N's refcount--;
 - }

 - for_each_radix_tree_with_lock {
     if (radix_node->refcount == 0 && rb_tree_is_empty)
         free(radix_node);
 - }

3) lookup extents (#1)

 - lock_radix_tree
     if (notfound #1)
         goto out;
     #1's refcount++;
 - unlock_radix_tree

 - lock_rb_tree (#1)
     found extent A

     list_lock
     list_move(A)
     list_unlock
 - unlock_rb_tree (#1)

 - #1's rafcount--;


> 
> > 
> > > Approach B: need add list_head *
> > >
> > > Or maybe it's not a problem because we can afford all these above.
> > >
> > > Anyway, I'm not against.
> > >
> > > >
> > > > 3. It needs to set a minimum length for the candidate of extent cache.
> > > >  (e.g., 64)
> > >
> > > Is this used for shrinker? Can you please explain more about this minimum length?
> > >
> > > >
> > > > So, for example,
> > > > struct ino_entry_for_extents {
> > > > 	inode number;
> > > > 	rb_tree for extent_entry objects;
> > > > 	rwlock;
> > > > };
> > > >
> > > > struct extent_entry {
> > > > 	blkaddr, len;
> > > > 	list_head *;
> > >
> > > We should add one other field 'inode number' here, as through it we can get
> > > rb-tree root from ino_entry_for_extents for rb_erase when we free the extent
> > > entry in shrinker.
> > >
> > > > };
> > > >
> > > > Something like this.
> > > >
> > > > [A, B, C, ... are extent entry]
> > > >
> > > > The sbi has
> > > > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> > > >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> > > >               ` ino_entry_for_extents (#12) has F    in rb-tree
> > > >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> > > >
> > > > In f2fs_update_extent_cache and __get_data_block for #10,
> > > >   ino_entry_for_extents (#10) was founded and updated D or B.
> > > >   Then, updated entries are moved to MRU.
> > > >
> > > > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > > > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > > > should be released.
> > > >
> > > > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > > > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> > > >
> > > > IMO, we don't need to consider readahead for this, since get_data_block will
> > > > be called by VFS readahead.
> > >
> > > In get_data_block we can cache only one blkaddr once after get_dnode_of_data
> > > returned, it seems less efficient. So why not readahead selectively more
> > > contiguous valid blkaddr in get_dnode_of_data once?
> > 
> > Why do you think only one blkaddr?
> > get_data_block searches extents as many as possible.
> 
> Surely not one blkaddr.
> 
> Actually, what I mean is that:
> ->get_dnode_of_data
>   ->f2fs_ra_extent_cache()
> 
> f2fs_ra_extent_cache(node_page, offset) {
> 	for (i = offset; i < max; i++) {
> 		if (is_valid() && is_contiguous())
> 			len++;
> 	}
> 	f2fs_update_extent_cache(inode, fofs, blk_addr, len)
> }
> 
> But not:
> ->__get_data_block
>   ->get_dnode_of_data
>   ->f2fs_update_extent_cache(inode, fofs, blk_addr, 1);
> 
> Thanks,
> Yu
> 
> > 
> > Thanks,
> > 
> > >
> > > >
> > > > Furthermore, we need to think about whether LRU is really best or not.
> > > > IMO, the extent cache aims to improve second access speed, rather than initial
> > > > cold misses. So, maybe MRU or another algorithms would be better.
> > >
> > > Agree, let's rethink about this. :)
> > >
> > > Thanks,
> > >
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > 2) Limitation:
> > > > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > > > number will enlarge, so memory and CPU overhead will increase.
> > > > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > > > number to limit total extent number in each inode, This number is global
> > > > > configuration which is visable to all inode. This number will be exported to
> > > > > sysfs for configuring according to requirement of user. By default, designed
> > > > > number is 8.
> > > > >
> > > > > 3) Shrinker:
> > > > > There are two shrink paths:
> > > > > 	a) one is triggered when extent count has exceed the upper bound of
> > > > > 	inode's extent cache. We will try to release extent(s) from head of
> > > > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > > > 	This operation could be in f2fs_update_extent_cache().
> > > > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > > > 	get inode from head of global lru list(s), and release extent(s) with
> > > > > 	fixed number (by default: 64 extents) in inode one by one.
> > > > > 	This operation could be in f2fs_balance_fs_bg().
> > > > >
> > > > > 4) Revalidation:
> > > > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > > > it to inode when reopen.
> > > > > Besides, a global lru list is needed here for shrinker.
> > > > >
> > > > > 5) Readahead:
> > > > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > > > ->lock of global shrink list.
> > > > >

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
@ 2015-01-06 23:00                           ` Jaegeuk Kim
  0 siblings, 0 replies; 26+ messages in thread
From: Jaegeuk Kim @ 2015-01-06 23:00 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Sun, Jan 04, 2015 at 04:24:10PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Wednesday, December 31, 2014 4:26 PM
> > To: Chao Yu
> > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi Chao,
> > 
> > On Tue, Dec 30, 2014 at 06:10:21PM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > > -----Original Message-----
> > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > Sent: Tuesday, December 30, 2014 5:23 AM
> > > > To: Chao Yu
> > > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > >
> > > > Hi Chao,
> > > >
> > > > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > > >
> > > > [snip]
> > > >
> > > > Nice draft. :)
> > >
> > > Thanks! :)
> > >
> > > >
> > > > >
> > > > > Please see the draft below.
> > > > >
> > > > > 1) Extent management:
> > > > > If we use global management that managing all extents which are from different
> > > > > inodes in sbi, we will face with serious lock contention when we access these
> > > > > extents belong to different inodes concurrently, the loss may outweights the
> > > > > gain.
> > > >
> > > > Agreed.
> > > >
> > > > > So we choose a local management for extent which means all extents are
> > > > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > > > all extents globally by linking all inode into a global lru list for extent
> > > > > cache shrinker.
> > > > > Approach:
> > > > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > > > 		*extent tree: link all extent in rb-tree;
> > > > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > > > 		*lru list: sort all extents in accessing time order;
> > > > > 		*extent count: record total count of extents in cache.
> > > > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > > > 		*inode will be added or repostioned in this global list whenever
> > > > > 		extent is being access in this inode.
> > > > > 		*use spinlock to protect this shrink list.
> > > >
> > > > 1. How about adding a data structure with inode number instead of referring
> > > > inode pointer?
> > > >
> > > > 2. How about managing extent entries globally and setting an upper bound to
> > > > the number of extent entries instead of limiting them per each inode?
> > > > (The rb-tree will handle many extents per inode.)
> > >
> > > [assumption]
> > > Approach A: global lru list;
> > > Approach B: lru list per inode.
> > >
> > > I have considered Approach A before writing the draft, although Approach A could
> > > provide us higher ratio hit due to global lru, it may make lock contention more
> > > intensively and has more memory overhead descripted below. So I choose more
> > > conservative Approach B.
> > > 1) as extent_entry will split in Cow, we may lock extent_list more times when
> > > move these separated extent entries in extent_list, unless we have batch mode
> > > interface.
> > > 2) lock overhead between shrinker and lookuper and updater.
> > > e.g. extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > (#1) has A, C, E, G in rb-tree
> > > (#2) has B, D, F in rb-tree
> > > shrinker op: lock list -> trylock #1 -> unlock list -> free A -> unlock #1
> > > 	lock list -> trylock #2 -> unlock list -> free B -> unlock #2
> > > 	lock list -> trylock #1 -> unlock list -> free C -> unlock #1
> > > 	...
> > 
> > lock_list -> list_del(A) -> unlock_list -> lock #1 -> remove_tree(A) -> unlock #1
> > -> free A.
> 
> If unlock_list before lock #1, (A) could be modified by other thread between ->unlock_list
> and lock #1, so one more "lookup_tree(A)" is needed under lock #1:
> lock #1 -> lookup_tree(A) -> remove_tree(A) -> unlock #1
> 
> > 
> > Or,
> > lock_list -> list_del(A, B, C) -> unlock_list -> lock #1 -> remove_tree(A, C) ->
> > unlock #1 -> free A, C -> lock #2 -> remove_tree(B) -> unlock #2
> 
> ditto.
> To avoid unneeded lookup_tree(A), maybe we can use this series:
> 
> lock list -> get A from head -> trylock #1 -> try to get more node(C, E, G) of #1 in list
> -> unlock list -> remove_tree&free(A, C, E, G) -> unlock #1
> 
> > 
> > I think a couple of list operations do not cause severe lock contention.
> 
> Yeah, batch operation is better.
> But sometimes random write break extent into everywhere in shrinker's list,
> so batch operation can gain less in this condition.
> 
> > And, if we assing minimum length for extent caches, the contention would be
> > distributed.
> > 
> > This means that, it needs to manage each of all the extents in the cache should
> > have the length larger than minimum value.
> 
> If I do not misunderstand, what you mean is that if extent number in one inode cache
> is bigger than minimum value, then, we will start to add all extents of this inode
> into shrinker's list, and reposition them in the list when f2fs_{lookup,update}_extent_cache?
> 
> > 
> > > *trylock/unlock* for all free extent entries belong to one inode will be separated
> > > to lots of parts, resulting in more contention.
> > > 3) it has more memory overhead in each extent entry:
> > > Approach A: need add inode number for backref and list_head *
> > 
> > It doesn't need to add ino. Just remain them, and shrinker can remove empty
> > ino_entry_for_extents periodically.
> 
> I do not understand why we don't need ino, :(.
> Without ino, how can we get the rb-tree root pointer for rb_erase(node, root)?

Let me describe in more detail.

For example,

- global radix tree in sbi
  --> ino #1 --> rb_tree (rb_tree, rb_tree_lock, refcount=0)
  --> ino #2 --> rb_tree
  --> ino #3 --> rb_tree
                    `-> extent A 
		    `-> extent B (fofs, blkaddr, length, *list_head=empty)

- global extent list in sbi

  extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
  (#1) has A, C, E, G in rb-tree
  (#2) has B, D, F in rb-tree

1) update extents (#1)

 - lock_radix_tree
     if (notfound #1)
         allocate #1;
     #1's refcount++;
 - unlock_radix_tree

 - lock_rb_tree (#1)
     update A
     rb_delete E, if E->length is smaller than MIN_LEN.
     allocate & rb_add K, if K->length is larget than MIN_LEN.

     list_lock
     list_move(A)
     list_del(E)
     list_add_tail(K)
     list_unlock
 - unlock_rb_tree (#1)

 - #1's rafcount--;

2) shrinker

 - list_lock
     list_del A, B, C, ...
 - list_unlock

 - gang_look_up_radix_tree_with_lock {

     #N's refcount++;

     lock_rb_tree (#N)
         for_each_rb_node {
             if (list_empty(extent->list_head)) {
                 remove_rb_node(extent);
                 free(extent);
             }
	 }
     unlock_rb_tree (#N)

     #N's refcount--;
 - }

 - for_each_radix_tree_with_lock {
     if (radix_node->refcount == 0 && rb_tree_is_empty)
         free(radix_node);
 - }

3) lookup extents (#1)

 - lock_radix_tree
     if (notfound #1)
         goto out;
     #1's refcount++;
 - unlock_radix_tree

 - lock_rb_tree (#1)
     found extent A

     list_lock
     list_move(A)
     list_unlock
 - unlock_rb_tree (#1)

 - #1's rafcount--;


> 
> > 
> > > Approach B: need add list_head *
> > >
> > > Or maybe it's not a problem because we can afford all these above.
> > >
> > > Anyway, I'm not against.
> > >
> > > >
> > > > 3. It needs to set a minimum length for the candidate of extent cache.
> > > >  (e.g., 64)
> > >
> > > Is this used for shrinker? Can you please explain more about this minimum length?
> > >
> > > >
> > > > So, for example,
> > > > struct ino_entry_for_extents {
> > > > 	inode number;
> > > > 	rb_tree for extent_entry objects;
> > > > 	rwlock;
> > > > };
> > > >
> > > > struct extent_entry {
> > > > 	blkaddr, len;
> > > > 	list_head *;
> > >
> > > We should add one other field 'inode number' here, as through it we can get
> > > rb-tree root from ino_entry_for_extents for rb_erase when we free the extent
> > > entry in shrinker.
> > >
> > > > };
> > > >
> > > > Something like this.
> > > >
> > > > [A, B, C, ... are extent entry]
> > > >
> > > > The sbi has
> > > > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> > > >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> > > >               ` ino_entry_for_extents (#12) has F    in rb-tree
> > > >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> > > >
> > > > In f2fs_update_extent_cache and __get_data_block for #10,
> > > >   ino_entry_for_extents (#10) was founded and updated D or B.
> > > >   Then, updated entries are moved to MRU.
> > > >
> > > > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > > > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > > > should be released.
> > > >
> > > > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > > > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> > > >
> > > > IMO, we don't need to consider readahead for this, since get_data_block will
> > > > be called by VFS readahead.
> > >
> > > In get_data_block we can cache only one blkaddr once after get_dnode_of_data
> > > returned, it seems less efficient. So why not readahead selectively more
> > > contiguous valid blkaddr in get_dnode_of_data once?
> > 
> > Why do you think only one blkaddr?
> > get_data_block searches extents as many as possible.
> 
> Surely not one blkaddr.
> 
> Actually, what I mean is that:
> ->get_dnode_of_data
>   ->f2fs_ra_extent_cache()
> 
> f2fs_ra_extent_cache(node_page, offset) {
> 	for (i = offset; i < max; i++) {
> 		if (is_valid() && is_contiguous())
> 			len++;
> 	}
> 	f2fs_update_extent_cache(inode, fofs, blk_addr, len)
> }
> 
> But not:
> ->__get_data_block
>   ->get_dnode_of_data
>   ->f2fs_update_extent_cache(inode, fofs, blk_addr, 1);
> 
> Thanks,
> Yu
> 
> > 
> > Thanks,
> > 
> > >
> > > >
> > > > Furthermore, we need to think about whether LRU is really best or not.
> > > > IMO, the extent cache aims to improve second access speed, rather than initial
> > > > cold misses. So, maybe MRU or another algorithms would be better.
> > >
> > > Agree, let's rethink about this. :)
> > >
> > > Thanks,
> > >
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > 2) Limitation:
> > > > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > > > number will enlarge, so memory and CPU overhead will increase.
> > > > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > > > number to limit total extent number in each inode, This number is global
> > > > > configuration which is visable to all inode. This number will be exported to
> > > > > sysfs for configuring according to requirement of user. By default, designed
> > > > > number is 8.
> > > > >
> > > > > 3) Shrinker:
> > > > > There are two shrink paths:
> > > > > 	a) one is triggered when extent count has exceed the upper bound of
> > > > > 	inode's extent cache. We will try to release extent(s) from head of
> > > > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > > > 	This operation could be in f2fs_update_extent_cache().
> > > > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > > > 	get inode from head of global lru list(s), and release extent(s) with
> > > > > 	fixed number (by default: 64 extents) in inode one by one.
> > > > > 	This operation could be in f2fs_balance_fs_bg().
> > > > >
> > > > > 4) Revalidation:
> > > > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > > > it to inode when reopen.
> > > > > Besides, a global lru list is needed here for shrinker.
> > > > >
> > > > > 5) Readahead:
> > > > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > > > ->lock of global shrink list.
> > > > >

------------------------------------------------------------------------------
Dive into the World of Parallel Programming! The Go Parallel Website,
sponsored by Intel and developed in partnership with Slashdot Media, is your
hub for all things parallel software development, from weekly thought
leadership blogs to news, videos, case studies, tutorials and more. Take a
look and join the conversation now. http://goparallel.sourceforge.net

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2015-01-04  3:19                     ` Chao Yu
@ 2015-01-07 11:16                       ` Changman Lee
  2015-01-12  7:06                         ` Chao Yu
  0 siblings, 1 reply; 26+ messages in thread
From: Changman Lee @ 2015-01-07 11:16 UTC (permalink / raw)
  To: Chao Yu; +Cc: 'Jaegeuk Kim', linux-f2fs-devel, linux-kernel

Hi Chao,

On Sun, Jan 04, 2015 at 11:19:28AM +0800, Chao Yu wrote:
> Hi Changman,
> 
> Sorry for replying late!
> 
> > -----Original Message-----
> > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > Sent: Tuesday, December 30, 2014 8:32 AM
> > To: Jaegeuk Kim
> > Cc: Chao Yu; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > 
> > Hi all,
> > 
> > On Mon, Dec 29, 2014 at 01:23:00PM -0800, Jaegeuk Kim wrote:
> > > Hi Chao,
> > >
> > > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > >
> > > [snip]
> > >
> > > Nice draft. :)
> > >
> > > >
> > > > Please see the draft below.
> > > >
> > > > 1) Extent management:
> > > > If we use global management that managing all extents which are from different
> > > > inodes in sbi, we will face with serious lock contention when we access these
> > > > extents belong to different inodes concurrently, the loss may outweights the
> > > > gain.
> > >
> > > Agreed.
> > >
> > > > So we choose a local management for extent which means all extents are
> > > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > > all extents globally by linking all inode into a global lru list for extent
> > > > cache shrinker.
> > > > Approach:
> > > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > > 		*extent tree: link all extent in rb-tree;
> > > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > > 		*lru list: sort all extents in accessing time order;
> > > > 		*extent count: record total count of extents in cache.
> > > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > > 		*inode will be added or repostioned in this global list whenever
> > > > 		extent is being access in this inode.
> > > > 		*use spinlock to protect this shrink list.
> > >
> > > 1. How about adding a data structure with inode number instead of referring
> > > inode pointer?
> > >
> > > 2. How about managing extent entries globally and setting an upper bound to
> > > the number of extent entries instead of limiting them per each inode?
> > > (The rb-tree will handle many extents per inode.)
> > >
> > > 3. It needs to set a minimum length for the candidate of extent cache.
> > >  (e.g., 64)
> > >
> > 
> > Agreed.
> > 
> > > So, for example,
> > > struct ino_entry_for_extents {
> > > 	inode number;
> > > 	rb_tree for extent_entry objects;
> > > 	rwlock;
> > > };
> > >
> > > struct extent_entry {
> > > 	blkaddr, len;
> > > 	list_head *;
> > > };
> > >
> > > Something like this.
> > >
> > > [A, B, C, ... are extent entry]
> > >
> > > The sbi has
> > > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> > >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> > >               ` ino_entry_for_extents (#12) has F    in rb-tree
> > >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> > >
> > > In f2fs_update_extent_cache and __get_data_block for #10,
> > >   ino_entry_for_extents (#10) was founded and updated D or B.
> > >   Then, updated entries are moved to MRU.
> > >
> > > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > > should be released.
> > >
> > > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> > >
> > > IMO, we don't need to consider readahead for this, since get_data_block will
> > > be called by VFS readahead.
> > >
> > > Furthermore, we need to think about whether LRU is really best or not.
> > > IMO, the extent cache aims to improve second access speed, rather than initial
> > > cold misses. So, maybe MRU or another algorithms would be better.
> > >
> > 
> > Right. It's very comflicated to judge which is better.
> > In read or write path, extents could be made every time. At that time, we should
> > decide which extent evicts instead of new extents if we set upper bound.
> > In update, one extent could be seperated into 3. It requires 3 insertion and 1 deletion.
> > So if update happends frequently, we could give up extent management for some ranges.
> > And we need to bring ideas from vm managemnt. For example,
> > active/inactive list and second chance to promotion, or batch work for insertion/deletion
> > 
> > I thought suddenly 'Simple is best'.
> > Let's think about better ideas together.
> 
> Yeah, how about using an opposite way to the way of page cache manager?
> 
> for example:
> node page A,B,C,D is in page cache;
> extent a,b,c,d is in extent cache;
> extent a is built from page A, ..., d is built from page D.
> page cache: LRU A -> B -> C -> D MRU
> extent cache: LRU a -> b -> c -> d MRU
> 
> If we use
> 1) the same way LRU, cache pair A-a, B-b, ... may be reclaimed in the same time as OOM.
> 2) the opposite way, maybe A,B in page cache and d,c in extent cache will be reclaimed,
> but we still can hit whole cache in combination of page cache and extent cache.
> 
> So by the way '2)' we can increase the total coverage area of page cache + extent cache.

Good idea. :)
If we decide which one is better between global lru and per inode lru,
it's expected that f2fs read performance will be more improved.
I'll wait your patch.

Thanks,

> 
> > 
> > > Thanks,
> > >
> > > >
> > > > 2) Limitation:
> > > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > > number will enlarge, so memory and CPU overhead will increase.
> > > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > > number to limit total extent number in each inode, This number is global
> > > > configuration which is visable to all inode. This number will be exported to
> > > > sysfs for configuring according to requirement of user. By default, designed
> > > > number is 8.
> > > >
> > 
> > Chao,
> > It's better which # of extent are controlled globally rather than limit extents
> > per inode as Jaegeuk said to reduce extent management overhead.
> 
> It's OK.
> 
> > 
> > > > 3) Shrinker:
> > > > There are two shrink paths:
> > > > 	a) one is triggered when extent count has exceed the upper bound of
> > > > 	inode's extent cache. We will try to release extent(s) from head of
> > > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > > 	This operation could be in f2fs_update_extent_cache().
> > > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > > 	get inode from head of global lru list(s), and release extent(s) with
> > > > 	fixed number (by default: 64 extents) in inode one by one.
> > > > 	This operation could be in f2fs_balance_fs_bg().
> > > >
> > 
> > Let's consider to use register_shrinker which is called by vm when
> > memory pressure happens.
> 
> Great, thanks for your reminding! :-)
> 
> Thanks,
> Yu
> 
> > 
> > > > 4) Revalidation:
> > > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > > it to inode when reopen.
> > > > Besides, a global lru list is needed here for shrinker.
> > > >
> > > > 5) Readahead:
> > > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > > ->lock of global shrink list.
> > > >

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2015-01-06 23:00                           ` Jaegeuk Kim
  (?)
@ 2015-01-12  7:04                           ` Chao Yu
  -1 siblings, 0 replies; 26+ messages in thread
From: Chao Yu @ 2015-01-12  7:04 UTC (permalink / raw)
  To: 'Jaegeuk Kim'
  Cc: 'Changman Lee', linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Wednesday, January 07, 2015 7:01 AM
> To: Chao Yu
> Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Sun, Jan 04, 2015 at 04:24:10PM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Wednesday, December 31, 2014 4:26 PM
> > > To: Chao Yu
> > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi Chao,
> > >
> > > On Tue, Dec 30, 2014 at 06:10:21PM +0800, Chao Yu wrote:
> > > > Hi Jaegeuk,
> > > >
> > > > > -----Original Message-----
> > > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > > Sent: Tuesday, December 30, 2014 5:23 AM
> > > > > To: Chao Yu
> > > > > Cc: 'Changman Lee'; linux-f2fs-devel@lists.sourceforge.net;
> linux-kernel@vger.kernel.org
> > > > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > > > >
> > > > > Hi Chao,
> > > > >
> > > > > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > > > >
> > > > > [snip]
> > > > >
> > > > > Nice draft. :)
> > > >
> > > > Thanks! :)
> > > >
> > > > >
> > > > > >
> > > > > > Please see the draft below.
> > > > > >
> > > > > > 1) Extent management:
> > > > > > If we use global management that managing all extents which are from different
> > > > > > inodes in sbi, we will face with serious lock contention when we access these
> > > > > > extents belong to different inodes concurrently, the loss may outweights the
> > > > > > gain.
> > > > >
> > > > > Agreed.
> > > > >
> > > > > > So we choose a local management for extent which means all extents are
> > > > > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > > > > all extents globally by linking all inode into a global lru list for extent
> > > > > > cache shrinker.
> > > > > > Approach:
> > > > > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > > > > 		*extent tree: link all extent in rb-tree;
> > > > > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > > > > 		*lru list: sort all extents in accessing time order;
> > > > > > 		*extent count: record total count of extents in cache.
> > > > > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > > > > 		*inode will be added or repostioned in this global list whenever
> > > > > > 		extent is being access in this inode.
> > > > > > 		*use spinlock to protect this shrink list.
> > > > >
> > > > > 1. How about adding a data structure with inode number instead of referring
> > > > > inode pointer?
> > > > >
> > > > > 2. How about managing extent entries globally and setting an upper bound to
> > > > > the number of extent entries instead of limiting them per each inode?
> > > > > (The rb-tree will handle many extents per inode.)
> > > >
> > > > [assumption]
> > > > Approach A: global lru list;
> > > > Approach B: lru list per inode.
> > > >
> > > > I have considered Approach A before writing the draft, although Approach A could
> > > > provide us higher ratio hit due to global lru, it may make lock contention more
> > > > intensively and has more memory overhead descripted below. So I choose more
> > > > conservative Approach B.
> > > > 1) as extent_entry will split in Cow, we may lock extent_list more times when
> > > > move these separated extent entries in extent_list, unless we have batch mode
> > > > interface.
> > > > 2) lock overhead between shrinker and lookuper and updater.
> > > > e.g. extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > > (#1) has A, C, E, G in rb-tree
> > > > (#2) has B, D, F in rb-tree
> > > > shrinker op: lock list -> trylock #1 -> unlock list -> free A -> unlock #1
> > > > 	lock list -> trylock #2 -> unlock list -> free B -> unlock #2
> > > > 	lock list -> trylock #1 -> unlock list -> free C -> unlock #1
> > > > 	...
> > >
> > > lock_list -> list_del(A) -> unlock_list -> lock #1 -> remove_tree(A) -> unlock #1
> > > -> free A.
> >
> > If unlock_list before lock #1, (A) could be modified by other thread between ->unlock_list
> > and lock #1, so one more "lookup_tree(A)" is needed under lock #1:
> > lock #1 -> lookup_tree(A) -> remove_tree(A) -> unlock #1
> >
> > >
> > > Or,
> > > lock_list -> list_del(A, B, C) -> unlock_list -> lock #1 -> remove_tree(A, C) ->
> > > unlock #1 -> free A, C -> lock #2 -> remove_tree(B) -> unlock #2
> >
> > ditto.
> > To avoid unneeded lookup_tree(A), maybe we can use this series:
> >
> > lock list -> get A from head -> trylock #1 -> try to get more node(C, E, G) of #1 in list
> > -> unlock list -> remove_tree&free(A, C, E, G) -> unlock #1
> >
> > >
> > > I think a couple of list operations do not cause severe lock contention.
> >
> > Yeah, batch operation is better.
> > But sometimes random write break extent into everywhere in shrinker's list,
> > so batch operation can gain less in this condition.
> >
> > > And, if we assing minimum length for extent caches, the contention would be
> > > distributed.
> > >
> > > This means that, it needs to manage each of all the extents in the cache should
> > > have the length larger than minimum value.
> >
> > If I do not misunderstand, what you mean is that if extent number in one inode cache
> > is bigger than minimum value, then, we will start to add all extents of this inode
> > into shrinker's list, and reposition them in the list when f2fs_{lookup,update}_extent_cache?
> >
> > >
> > > > *trylock/unlock* for all free extent entries belong to one inode will be separated
> > > > to lots of parts, resulting in more contention.
> > > > 3) it has more memory overhead in each extent entry:
> > > > Approach A: need add inode number for backref and list_head *
> > >
> > > It doesn't need to add ino. Just remain them, and shrinker can remove empty
> > > ino_entry_for_extents periodically.
> >
> > I do not understand why we don't need ino, :(.
> > Without ino, how can we get the rb-tree root pointer for rb_erase(node, root)?
> 
> Let me describe in more detail.
> 
> For example,
> 
> - global radix tree in sbi
>   --> ino #1 --> rb_tree (rb_tree, rb_tree_lock, refcount=0)
>   --> ino #2 --> rb_tree
>   --> ino #3 --> rb_tree
>                     `-> extent A
> 		    `-> extent B (fofs, blkaddr, length, *list_head=empty)
> 
> - global extent list in sbi
> 
>   extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
>   (#1) has A, C, E, G in rb-tree
>   (#2) has B, D, F in rb-tree
> 
> 1) update extents (#1)
> 
>  - lock_radix_tree
>      if (notfound #1)
>          allocate #1;
>      #1's refcount++;
>  - unlock_radix_tree
> 
>  - lock_rb_tree (#1)
>      update A
>      rb_delete E, if E->length is smaller than MIN_LEN.
>      allocate & rb_add K, if K->length is larget than MIN_LEN.
> 
>      list_lock
>      list_move(A)
>      list_del(E)
>      list_add_tail(K)
>      list_unlock
>  - unlock_rb_tree (#1)
> 
>  - #1's rafcount--;
> 
> 2) shrinker
> 
>  - list_lock
>      list_del A, B, C, ...
>  - list_unlock
> 
>  - gang_look_up_radix_tree_with_lock {
> 
>      #N's refcount++;
> 
>      lock_rb_tree (#N)
>          for_each_rb_node {
>              if (list_empty(extent->list_head)) {
>                  remove_rb_node(extent);
>                  free(extent);
>              }
> 	 }
>      unlock_rb_tree (#N)
> 
>      #N's refcount--;
>  - }
> 
>  - for_each_radix_tree_with_lock {
>      if (radix_node->refcount == 0 && rb_tree_is_empty)
>          free(radix_node);
>  - }
> 
> 3) lookup extents (#1)
> 
>  - lock_radix_tree
>      if (notfound #1)
>          goto out;
>      #1's refcount++;
>  - unlock_radix_tree
> 
>  - lock_rb_tree (#1)
>      found extent A
> 
>      list_lock
>      list_move(A)
>      list_unlock
>  - unlock_rb_tree (#1)
> 
>  - #1's rafcount--;

Really Good! Thanks for your suggestion and the work.

The following RFC patch set is written based on above detail design.

Please review and comment on it, thank you! :)

Thanks


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [RFC PATCH] f2fs: add extent cache base on rb-tree
  2015-01-07 11:16                       ` Changman Lee
@ 2015-01-12  7:06                         ` Chao Yu
  0 siblings, 0 replies; 26+ messages in thread
From: Chao Yu @ 2015-01-12  7:06 UTC (permalink / raw)
  To: 'Changman Lee'
  Cc: 'Jaegeuk Kim', linux-f2fs-devel, linux-kernel

Hi Changman,

> -----Original Message-----
> From: Changman Lee [mailto:cm224.lee@samsung.com]
> Sent: Wednesday, January 07, 2015 7:17 PM
> To: Chao Yu
> Cc: 'Jaegeuk Kim'; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> 
> Hi Chao,
> 
> On Sun, Jan 04, 2015 at 11:19:28AM +0800, Chao Yu wrote:
> > Hi Changman,
> >
> > Sorry for replying late!
> >
> > > -----Original Message-----
> > > From: Changman Lee [mailto:cm224.lee@samsung.com]
> > > Sent: Tuesday, December 30, 2014 8:32 AM
> > > To: Jaegeuk Kim
> > > Cc: Chao Yu; linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [RFC PATCH] f2fs: add extent cache base on rb-tree
> > >
> > > Hi all,
> > >
> > > On Mon, Dec 29, 2014 at 01:23:00PM -0800, Jaegeuk Kim wrote:
> > > > Hi Chao,
> > > >
> > > > On Mon, Dec 29, 2014 at 03:19:18PM +0800, Chao Yu wrote:
> > > >
> > > > [snip]
> > > >
> > > > Nice draft. :)
> > > >
> > > > >
> > > > > Please see the draft below.
> > > > >
> > > > > 1) Extent management:
> > > > > If we use global management that managing all extents which are from different
> > > > > inodes in sbi, we will face with serious lock contention when we access these
> > > > > extents belong to different inodes concurrently, the loss may outweights the
> > > > > gain.
> > > >
> > > > Agreed.
> > > >
> > > > > So we choose a local management for extent which means all extents are
> > > > > managed by inode itself to avoid above lock contention. Addtionlly, we manage
> > > > > all extents globally by linking all inode into a global lru list for extent
> > > > > cache shrinker.
> > > > > Approach:
> > > > > 	a) build extent tree/rwlock/lru list/extent count in each inode.
> > > > > 		*extent tree: link all extent in rb-tree;
> > > > > 		*rwlock: protect fields when accessing extent cache concurrently;
> > > > > 		*lru list: sort all extents in accessing time order;
> > > > > 		*extent count: record total count of extents in cache.
> > > > > 	b) use lru shrink list in sbi to manage all inode which cached extents.
> > > > > 		*inode will be added or repostioned in this global list whenever
> > > > > 		extent is being access in this inode.
> > > > > 		*use spinlock to protect this shrink list.
> > > >
> > > > 1. How about adding a data structure with inode number instead of referring
> > > > inode pointer?
> > > >
> > > > 2. How about managing extent entries globally and setting an upper bound to
> > > > the number of extent entries instead of limiting them per each inode?
> > > > (The rb-tree will handle many extents per inode.)
> > > >
> > > > 3. It needs to set a minimum length for the candidate of extent cache.
> > > >  (e.g., 64)
> > > >
> > >
> > > Agreed.
> > >
> > > > So, for example,
> > > > struct ino_entry_for_extents {
> > > > 	inode number;
> > > > 	rb_tree for extent_entry objects;
> > > > 	rwlock;
> > > > };
> > > >
> > > > struct extent_entry {
> > > > 	blkaddr, len;
> > > > 	list_head *;
> > > > };
> > > >
> > > > Something like this.
> > > >
> > > > [A, B, C, ... are extent entry]
> > > >
> > > > The sbi has
> > > > 1. an extent_list: (LRU) A -> B -> C -> D -> E -> F -> G (MRU)
> > > > 2. radix_tree:  ino_entry_for_extents (#10) has D, B in rb-tree
> > > >               ` ino_entry_for_extents (#11) has A, C in rb-tree
> > > >               ` ino_entry_for_extents (#12) has F    in rb-tree
> > > >               ` ino_entry_for_extents (#13) has G, E in rb-tree
> > > >
> > > > In f2fs_update_extent_cache and __get_data_block for #10,
> > > >   ino_entry_for_extents (#10) was founded and updated D or B.
> > > >   Then, updated entries are moved to MRU.
> > > >
> > > > In f2fs_evict_inode for #11, A and C are moved to LRU.
> > > > But, if this inode is unlinked, all the A, C, and ino_entry_for_extens (#11)
> > > > should be released.
> > > >
> > > > In f2fs_balance_fs_bg, some LRU extents are released according to the amount
> > > > of consumed memory. Then, it frees any ino_entry_for_extents having no extent.
> > > >
> > > > IMO, we don't need to consider readahead for this, since get_data_block will
> > > > be called by VFS readahead.
> > > >
> > > > Furthermore, we need to think about whether LRU is really best or not.
> > > > IMO, the extent cache aims to improve second access speed, rather than initial
> > > > cold misses. So, maybe MRU or another algorithms would be better.
> > > >
> > >
> > > Right. It's very comflicated to judge which is better.
> > > In read or write path, extents could be made every time. At that time, we should
> > > decide which extent evicts instead of new extents if we set upper bound.
> > > In update, one extent could be seperated into 3. It requires 3 insertion and 1 deletion.
> > > So if update happends frequently, we could give up extent management for some ranges.
> > > And we need to bring ideas from vm managemnt. For example,
> > > active/inactive list and second chance to promotion, or batch work for insertion/deletion
> > >
> > > I thought suddenly 'Simple is best'.
> > > Let's think about better ideas together.
> >
> > Yeah, how about using an opposite way to the way of page cache manager?
> >
> > for example:
> > node page A,B,C,D is in page cache;
> > extent a,b,c,d is in extent cache;
> > extent a is built from page A, ..., d is built from page D.
> > page cache: LRU A -> B -> C -> D MRU
> > extent cache: LRU a -> b -> c -> d MRU
> >
> > If we use
> > 1) the same way LRU, cache pair A-a, B-b, ... may be reclaimed in the same time as OOM.
> > 2) the opposite way, maybe A,B in page cache and d,c in extent cache will be reclaimed,
> > but we still can hit whole cache in combination of page cache and extent cache.
> >
> > So by the way '2)' we can increase the total coverage area of page cache + extent cache.
> 
> Good idea. :)
> If we decide which one is better between global lru and per inode lru,
> it's expected that f2fs read performance will be more improved.
> I'll wait your patch.

Please review and comment on following RFC patch set, thank you! :)

Thanks,

> 
> Thanks,
> 
> >
> > >
> > > > Thanks,
> > > >
> > > > >
> > > > > 2) Limitation:
> > > > > In one inode, as we split or add extent in extent cache when read/write, extent
> > > > > number will enlarge, so memory and CPU overhead will increase.
> > > > > In order to control the overhead of memory and CPU, we try to set a upper bound
> > > > > number to limit total extent number in each inode, This number is global
> > > > > configuration which is visable to all inode. This number will be exported to
> > > > > sysfs for configuring according to requirement of user. By default, designed
> > > > > number is 8.
> > > > >
> > >
> > > Chao,
> > > It's better which # of extent are controlled globally rather than limit extents
> > > per inode as Jaegeuk said to reduce extent management overhead.
> >
> > It's OK.
> >
> > >
> > > > > 3) Shrinker:
> > > > > There are two shrink paths:
> > > > > 	a) one is triggered when extent count has exceed the upper bound of
> > > > > 	inode's extent cache. We will try to release extent(s) from head of
> > > > > 	inode's inner extent lru list until extent count is equal to upper bound.
> > > > > 	This operation could be in f2fs_update_extent_cache().
> > > > > 	b) the other one is triggered when memory util exceed threshold, we try
> > > > > 	get inode from head of global lru list(s), and release extent(s) with
> > > > > 	fixed number (by default: 64 extents) in inode one by one.
> > > > > 	This operation could be in f2fs_balance_fs_bg().
> > > > >
> > >
> > > Let's consider to use register_shrinker which is called by vm when
> > > memory pressure happens.
> >
> > Great, thanks for your reminding! :-)
> >
> > Thanks,
> > Yu
> >
> > >
> > > > > 4) Revalidation:
> > > > > In ->evict(), extent cache will be released, in order to reuse extent cache
> > > > > of inode when reopen for high hit ratio, a proper way is to add cached extent
> > > > > tree into a hash table (could be radix tree), revalidate extent tree and recover
> > > > > it to inode when reopen.
> > > > > Besides, a global lru list is needed here for shrinker.
> > > > >
> > > > > 5) Readahead:
> > > > > Expand extent cache by readaheading when we call get_dnode_of_data in non-emergency
> > > > > path. Note, ra can affect lock contention for both ->rwlock in extent cache and
> > > > > ->lock of global shrink list.
> > > > >


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2015-01-12  7:07 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-19 10:49 [RFC PATCH] f2fs: add extent cache base on rb-tree Chao Yu
2014-12-22  2:03 ` Changman Lee
2014-12-22  7:10   ` Chao Yu
2014-12-22 23:16     ` Jaegeuk Kim
2014-12-23  3:01       ` Chao Yu
2014-12-23  7:36         ` Jaegeuk Kim
2014-12-23  8:45           ` Changman Lee
2014-12-24  8:01           ` Chao Yu
2014-12-25  8:35             ` Jaegeuk Kim
2014-12-29  7:19               ` Chao Yu
2014-12-29 21:23                 ` Jaegeuk Kim
2014-12-30  0:32                   ` Changman Lee
2015-01-04  3:19                     ` Chao Yu
2015-01-07 11:16                       ` Changman Lee
2015-01-12  7:06                         ` Chao Yu
2014-12-30 10:10                   ` Chao Yu
2014-12-31  8:26                     ` Jaegeuk Kim
2015-01-04  8:24                       ` Chao Yu
2015-01-06 23:00                         ` Jaegeuk Kim
2015-01-06 23:00                           ` Jaegeuk Kim
2015-01-12  7:04                           ` Chao Yu
2014-12-23  4:30     ` Changman Lee
2014-12-23  9:35       ` Chao Yu
2014-12-23 19:24         ` Jaegeuk Kim
2014-12-23 19:24           ` Jaegeuk Kim
2014-12-22  9:06   ` Chao Yu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.