linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
@ 2018-07-08 14:04 Chao Yu
  2018-07-08 14:04 ` [PATCH v3 2/2] f2fs: let checkpoint flush dnode page of regular Chao Yu
  2018-07-14  5:44 ` [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Jaegeuk Kim
  0 siblings, 2 replies; 8+ messages in thread
From: Chao Yu @ 2018-07-08 14:04 UTC (permalink / raw)
  To: jaegeuk; +Cc: linux-f2fs-devel, linux-kernel, Chao Yu

From: Chao Yu <yuchao0@huawei.com>

f2fs recovery flow is relying on dnode block link list, it means fsynced
file recovery depends on previous dnode's persistence in the list, so
during fsync() we should wait on all regular inode's dnode writebacked
before issuing flush.

By this way, we can avoid dnode block list being broken by out-of-order
IO submission due to IO scheduler or driver.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
---
v3:
- add a list to link all writebacked dnodes, let fsync() only wait on
necessary dnode.
 fs/f2fs/checkpoint.c |   2 +
 fs/f2fs/data.c       |   2 +
 fs/f2fs/f2fs.h       |  21 +++++++-
 fs/f2fs/file.c       |  20 +++----
 fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
 fs/f2fs/super.c      |   4 ++
 6 files changed, 153 insertions(+), 44 deletions(-)

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 8b698bd54490..d5e60d76362e 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	f2fs_release_ino_entry(sbi, false);
 
+	f2fs_reset_fsync_node_info(sbi);
+
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
 
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 70813a4dda3e..afe76d87575c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
 					page->index != nid_of_node(page));
 
 		dec_page_count(sbi, WB_DATA_TYPE(page));
+		if (f2fs_in_warm_node_list(sbi, page))
+			f2fs_del_fsync_node_entry(sbi, page);
 		clear_cold_data(page);
 		end_page_writeback(page);
 	}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c8c865fa8450..bf5f7a336ace 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -228,6 +228,12 @@ struct inode_entry {
 	struct inode *inode;	/* vfs inode pointer */
 };
 
+struct fsync_node_entry {
+	struct list_head list;	/* list head */
+	struct page *page;	/* warm node page pointer */
+	unsigned int seq_id;	/* sequence id */
+};
+
 /* for the bitmap indicate blocks to be discarded */
 struct discard_entry {
 	struct list_head list;	/* list head */
@@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
 
 	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
 
+	spinlock_t fsync_node_lock;		/* for node entry lock */
+	struct list_head fsync_node_list;	/* node list head */
+	unsigned int fsync_seg_id;		/* sequence id */
+	unsigned int fsync_node_num;		/* number of node entries */
+
 	/* for orphan inode, use 0'th array */
 	unsigned int max_orphans;		/* max orphan inodes */
 
@@ -2816,6 +2827,10 @@ struct node_info;
 
 int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
 bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
 int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
 bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
 bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
@@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
 int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
 int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
 int f2fs_truncate_xattr_node(struct inode *inode);
-int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
+int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
+					unsigned int seq_id);
 int f2fs_remove_inode_page(struct inode *inode);
 struct page *f2fs_new_inode_page(struct inode *inode);
 struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
@@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
 struct page *f2fs_get_node_page_ra(struct page *parent, int start);
 void f2fs_move_node_page(struct page *node_page, int gc_type);
 int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
-			struct writeback_control *wbc, bool atomic);
+			struct writeback_control *wbc, bool atomic,
+			unsigned int *seq_id);
 int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
 			struct writeback_control *wbc,
 			bool do_balance, enum iostat_type io_type);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5e29d4053748..ddea2bfd4042 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
 		.nr_to_write = LONG_MAX,
 		.for_reclaim = 0,
 	};
+	unsigned int seq_id = 0;
 
 	if (unlikely(f2fs_readonly(inode->i_sb)))
 		return 0;
@@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
 	}
 sync_nodes:
 	atomic_inc(&sbi->wb_sync_req[NODE]);
-	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
+	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
 	atomic_dec(&sbi->wb_sync_req[NODE]);
 	if (ret)
 		goto out;
@@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
 		goto sync_nodes;
 	}
 
-	/*
-	 * If it's atomic_write, it's just fine to keep write ordering. So
-	 * here we don't need to wait for node write completion, since we use
-	 * node chain which serializes node blocks. If one of node writes are
-	 * reordered, we can see simply broken chain, resulting in stopping
-	 * roll-forward recovery. It means we'll recover all or none node blocks
-	 * given fsync mark.
-	 */
-	if (!atomic) {
-		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
-		if (ret)
-			goto out;
-	}
+
+	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
+	if (ret)
+		goto out;
 
 	/* once recovery info is written, don't need to tack this */
 	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 1e30bc305243..31dc372c56a0 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -28,6 +28,7 @@
 static struct kmem_cache *nat_entry_slab;
 static struct kmem_cache *free_nid_slab;
 static struct kmem_cache *nat_entry_set_slab;
+static struct kmem_cache *fsync_node_entry_slab;
 
 /*
  * Check whether the given nid is within node id range.
@@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
 							start, nr);
 }
 
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
+{
+	return NODE_MAPPING(sbi) == page->mapping &&
+			IS_DNODE(page) && is_cold_node(page);
+}
+
+void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
+{
+	spin_lock_init(&sbi->fsync_node_lock);
+	INIT_LIST_HEAD(&sbi->fsync_node_list);
+	sbi->fsync_seg_id = 0;
+	sbi->fsync_node_num = 0;
+}
+
+static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
+							struct page *page)
+{
+	struct fsync_node_entry *fn;
+	unsigned long flags;
+	unsigned int seq_id;
+
+	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
+
+	get_page(page);
+	fn->page = page;
+	INIT_LIST_HEAD(&fn->list);
+
+	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+	list_add_tail(&fn->list, &sbi->fsync_node_list);
+	fn->seq_id = sbi->fsync_seg_id++;
+	seq_id = fn->seq_id;
+	sbi->fsync_node_num++;
+	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+
+	return seq_id;
+}
+
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
+{
+	struct fsync_node_entry *fn;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
+		if (fn->page == page) {
+			list_del(&fn->list);
+			sbi->fsync_node_num--;
+			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+			kmem_cache_free(fsync_node_entry_slab, fn);
+			put_page(page);
+			return;
+		}
+	}
+	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+	f2fs_bug_on(sbi, 1);
+}
+
+void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+	sbi->fsync_node_num = 0;
+	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+}
+
 int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
 
 static int __write_node_page(struct page *page, bool atomic, bool *submitted,
 				struct writeback_control *wbc, bool do_balance,
-				enum iostat_type io_type)
+				enum iostat_type io_type, unsigned int *seq_id)
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	nid_t nid;
@@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
 		.io_type = io_type,
 		.io_wbc = wbc,
 	};
+	unsigned int seq;
 
 	trace_f2fs_writepage(page, NODE);
 
@@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
 
 	set_page_writeback(page);
 	ClearPageError(page);
+
+	if (f2fs_in_warm_node_list(sbi, page)) {
+		seq = f2fs_add_fsync_node_entry(sbi, page);
+		if (seq_id)
+			*seq_id = seq;
+	}
+
 	fio.old_blkaddr = ni.blk_addr;
 	f2fs_do_write_node_page(nid, &fio);
 	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
 			goto out_page;
 
 		if (__write_node_page(node_page, false, NULL,
-					&wbc, false, FS_GC_NODE_IO))
+					&wbc, false, FS_GC_NODE_IO, NULL))
 			unlock_page(node_page);
 		goto release_page;
 	} else {
@@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
 static int f2fs_write_node_page(struct page *page,
 				struct writeback_control *wbc)
 {
-	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
+	return __write_node_page(page, false, NULL, wbc, false,
+						FS_NODE_IO, NULL);
 }
 
 int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
-			struct writeback_control *wbc, bool atomic)
+			struct writeback_control *wbc, bool atomic,
+			unsigned int *seq_id)
 {
 	pgoff_t index;
 	pgoff_t last_idx = ULONG_MAX;
@@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
 			ret = __write_node_page(page, atomic &&
 						page == last_page,
 						&submitted, wbc, true,
-						FS_NODE_IO);
+						FS_NODE_IO, seq_id);
 			if (ret) {
 				unlock_page(page);
 				f2fs_put_page(last_page, 0);
@@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
 			set_dentry_mark(page, 0);
 
 			ret = __write_node_page(page, false, &submitted,
-						wbc, do_balance, io_type);
+						wbc, do_balance, io_type, NULL);
 			if (ret)
 				unlock_page(page);
 			else if (submitted)
@@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
 	return ret;
 }
 
-int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
+int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
+						unsigned int seq_id)
 {
-	pgoff_t index = 0;
-	struct pagevec pvec;
-	int ret2, ret = 0;
-	int nr_pages;
+	struct fsync_node_entry *fn;
+	struct page *page;
+	struct list_head *head = &sbi->fsync_node_list;
+	unsigned long flags;
+	unsigned int cur_seq_id = 0;
+	int ret = 0;
 
-	pagevec_init(&pvec);
+	while (seq_id && cur_seq_id < seq_id) {
+		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+			break;
+		}
+		fn = list_first_entry(head, struct fsync_node_entry, list);
+		if (fn->seq_id > seq_id) {
+			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+			break;
+		}
+		cur_seq_id = fn->seq_id;
+		page = fn->page;
+		get_page(page);
+		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
 
-	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
-				PAGECACHE_TAG_WRITEBACK))) {
-		int i;
+		f2fs_wait_on_page_writeback(page, NODE, true);
+		if (TestClearPageError(page))
+			ret = -EIO;
 
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
+		put_page(page);
 
-			if (ino && ino_of_node(page) == ino) {
-				f2fs_wait_on_page_writeback(page, NODE, true);
-				if (TestClearPageError(page))
-					ret = -EIO;
-			}
-		}
-		pagevec_release(&pvec);
-		cond_resched();
+		if (ret)
+			break;
 	}
 
-	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
-	if (!ret)
-		ret = ret2;
 	return ret;
 }
 
@@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
 			sizeof(struct nat_entry_set));
 	if (!nat_entry_set_slab)
 		goto destroy_free_nid;
+
+	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+			sizeof(struct fsync_node_entry));
+	if (!fsync_node_entry_slab)
+		goto destroy_nat_entry_set;
 	return 0;
 
+destroy_nat_entry_set:
+	kmem_cache_destroy(nat_entry_set_slab);
 destroy_free_nid:
 	kmem_cache_destroy(free_nid_slab);
 destroy_nat_entry:
@@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
 
 void f2fs_destroy_node_manager_caches(void)
 {
+	kmem_cache_destroy(fsync_node_entry_slab);
 	kmem_cache_destroy(nat_entry_set_slab);
 	kmem_cache_destroy(free_nid_slab);
 	kmem_cache_destroy(nat_entry_slab);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 143ed321076e..34321932754d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
 	 */
 	f2fs_release_ino_entry(sbi, true);
 
+	f2fs_bug_on(sbi, sbi->fsync_node_num);
+
 	f2fs_leave_shrinker(sbi);
 	mutex_unlock(&sbi->umount_mutex);
 
@@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 
 	f2fs_init_ino_entry_info(sbi);
 
+	f2fs_init_fsync_node_info(sbi);
+
 	/* setup f2fs internal modules */
 	err = f2fs_build_segment_manager(sbi);
 	if (err) {
-- 
2.16.2.17.g38e79b1fd


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v3 2/2] f2fs: let checkpoint flush dnode page of regular
  2018-07-08 14:04 [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Chao Yu
@ 2018-07-08 14:04 ` Chao Yu
  2018-07-14  5:44 ` [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Jaegeuk Kim
  1 sibling, 0 replies; 8+ messages in thread
From: Chao Yu @ 2018-07-08 14:04 UTC (permalink / raw)
  To: jaegeuk; +Cc: linux-f2fs-devel, linux-kernel, Chao Yu

From: Chao Yu <yuchao0@huawei.com>

Fsyncer will wait on all dnode pages of regular writeback before flushing,
if there are async dnode pages blocked by IO scheduler, it may decrease
fsync's performance.

In this patch, we choose to let f2fs_balance_fs_bg() to trigger checkpoint
to flush these dnode pages of regular, so async IO of dnode page can be
elimitnated, making fsyncer only need to wait for sync IO.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
---
v3:
- rebase code.
 fs/f2fs/node.c    | 8 +++++++-
 fs/f2fs/node.h    | 5 +++++
 fs/f2fs/segment.c | 4 +++-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 31dc372c56a0..c48e2a2e5e82 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1453,6 +1453,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 
+	if (wbc->sync_mode == WB_SYNC_NONE &&
+			IS_DNODE(page) && is_cold_node(page))
+		goto redirty_out;
+
 	/* get old block addr of this node page */
 	nid = nid_of_node(page);
 	f2fs_bug_on(sbi, page->index != nid);
@@ -1778,10 +1782,12 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
 	}
 
 	if (step < 2) {
+		if (wbc->sync_mode == WB_SYNC_NONE && step == 1)
+			goto out;
 		step++;
 		goto next_step;
 	}
-
+out:
 	if (nwritten)
 		f2fs_submit_merged_write(sbi, NODE);
 
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index b95e49e4a928..b0da4c26eebb 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -135,6 +135,11 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
 	return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
 }
 
+static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
+{
+	return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
+}
+
 enum mem_type {
 	FREE_NIDS,	/* indicates the free nid list */
 	NAT_ENTRIES,	/* indicates the cached nat entry */
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 47b6595a078c..99beaf0a2dea 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -503,7 +503,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	else
 		f2fs_build_free_nids(sbi, false, false);
 
-	if (!is_idle(sbi) && !excess_dirty_nats(sbi))
+	if (!is_idle(sbi) &&
+		(!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi)))
 		return;
 
 	/* checkpoint is the only way to shrink partial cached entries */
@@ -511,6 +512,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 			!f2fs_available_free_memory(sbi, INO_ENTRIES) ||
 			excess_prefree_segs(sbi) ||
 			excess_dirty_nats(sbi) ||
+			excess_dirty_nodes(sbi) ||
 			f2fs_time_over(sbi, CP_TIME)) {
 		if (test_opt(sbi, DATA_FLUSH)) {
 			struct blk_plug plug;
-- 
2.16.2.17.g38e79b1fd


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
  2018-07-08 14:04 [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Chao Yu
  2018-07-08 14:04 ` [PATCH v3 2/2] f2fs: let checkpoint flush dnode page of regular Chao Yu
@ 2018-07-14  5:44 ` Jaegeuk Kim
  2018-07-15  1:30   ` Chao Yu
  1 sibling, 1 reply; 8+ messages in thread
From: Jaegeuk Kim @ 2018-07-14  5:44 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-f2fs-devel, linux-kernel, Chao Yu

On 07/08, Chao Yu wrote:
> From: Chao Yu <yuchao0@huawei.com>
> 
> f2fs recovery flow is relying on dnode block link list, it means fsynced
> file recovery depends on previous dnode's persistence in the list, so
> during fsync() we should wait on all regular inode's dnode writebacked
> before issuing flush.
> 
> By this way, we can avoid dnode block list being broken by out-of-order
> IO submission due to IO scheduler or driver.

Hi Chao,

Just in case, can we measure some performance numbers with this?

Thanks,

> 
> Signed-off-by: Chao Yu <yuchao0@huawei.com>
> ---
> v3:
> - add a list to link all writebacked dnodes, let fsync() only wait on
> necessary dnode.
>  fs/f2fs/checkpoint.c |   2 +
>  fs/f2fs/data.c       |   2 +
>  fs/f2fs/f2fs.h       |  21 +++++++-
>  fs/f2fs/file.c       |  20 +++----
>  fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
>  fs/f2fs/super.c      |   4 ++
>  6 files changed, 153 insertions(+), 44 deletions(-)
> 
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 8b698bd54490..d5e60d76362e 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  
>  	f2fs_release_ino_entry(sbi, false);
>  
> +	f2fs_reset_fsync_node_info(sbi);
> +
>  	if (unlikely(f2fs_cp_error(sbi)))
>  		return -EIO;
>  
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 70813a4dda3e..afe76d87575c 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
>  					page->index != nid_of_node(page));
>  
>  		dec_page_count(sbi, WB_DATA_TYPE(page));
> +		if (f2fs_in_warm_node_list(sbi, page))
> +			f2fs_del_fsync_node_entry(sbi, page);
>  		clear_cold_data(page);
>  		end_page_writeback(page);
>  	}
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index c8c865fa8450..bf5f7a336ace 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -228,6 +228,12 @@ struct inode_entry {
>  	struct inode *inode;	/* vfs inode pointer */
>  };
>  
> +struct fsync_node_entry {
> +	struct list_head list;	/* list head */
> +	struct page *page;	/* warm node page pointer */
> +	unsigned int seq_id;	/* sequence id */
> +};
> +
>  /* for the bitmap indicate blocks to be discarded */
>  struct discard_entry {
>  	struct list_head list;	/* list head */
> @@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
>  
>  	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
>  
> +	spinlock_t fsync_node_lock;		/* for node entry lock */
> +	struct list_head fsync_node_list;	/* node list head */
> +	unsigned int fsync_seg_id;		/* sequence id */
> +	unsigned int fsync_node_num;		/* number of node entries */
> +
>  	/* for orphan inode, use 0'th array */
>  	unsigned int max_orphans;		/* max orphan inodes */
>  
> @@ -2816,6 +2827,10 @@ struct node_info;
>  
>  int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
>  bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
>  bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
>  bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
> @@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
>  int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
>  int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
>  int f2fs_truncate_xattr_node(struct inode *inode);
> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
> +					unsigned int seq_id);
>  int f2fs_remove_inode_page(struct inode *inode);
>  struct page *f2fs_new_inode_page(struct inode *inode);
>  struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
> @@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
>  struct page *f2fs_get_node_page_ra(struct page *parent, int start);
>  void f2fs_move_node_page(struct page *node_page, int gc_type);
>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
> -			struct writeback_control *wbc, bool atomic);
> +			struct writeback_control *wbc, bool atomic,
> +			unsigned int *seq_id);
>  int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>  			struct writeback_control *wbc,
>  			bool do_balance, enum iostat_type io_type);
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 5e29d4053748..ddea2bfd4042 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>  		.nr_to_write = LONG_MAX,
>  		.for_reclaim = 0,
>  	};
> +	unsigned int seq_id = 0;
>  
>  	if (unlikely(f2fs_readonly(inode->i_sb)))
>  		return 0;
> @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>  	}
>  sync_nodes:
>  	atomic_inc(&sbi->wb_sync_req[NODE]);
> -	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
> +	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
>  	atomic_dec(&sbi->wb_sync_req[NODE]);
>  	if (ret)
>  		goto out;
> @@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>  		goto sync_nodes;
>  	}
>  
> -	/*
> -	 * If it's atomic_write, it's just fine to keep write ordering. So
> -	 * here we don't need to wait for node write completion, since we use
> -	 * node chain which serializes node blocks. If one of node writes are
> -	 * reordered, we can see simply broken chain, resulting in stopping
> -	 * roll-forward recovery. It means we'll recover all or none node blocks
> -	 * given fsync mark.
> -	 */
> -	if (!atomic) {
> -		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
> -		if (ret)
> -			goto out;
> -	}
> +
> +	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
> +	if (ret)
> +		goto out;
>  
>  	/* once recovery info is written, don't need to tack this */
>  	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 1e30bc305243..31dc372c56a0 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -28,6 +28,7 @@
>  static struct kmem_cache *nat_entry_slab;
>  static struct kmem_cache *free_nid_slab;
>  static struct kmem_cache *nat_entry_set_slab;
> +static struct kmem_cache *fsync_node_entry_slab;
>  
>  /*
>   * Check whether the given nid is within node id range.
> @@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
>  							start, nr);
>  }
>  
> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
> +{
> +	return NODE_MAPPING(sbi) == page->mapping &&
> +			IS_DNODE(page) && is_cold_node(page);
> +}
> +
> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
> +{
> +	spin_lock_init(&sbi->fsync_node_lock);
> +	INIT_LIST_HEAD(&sbi->fsync_node_list);
> +	sbi->fsync_seg_id = 0;
> +	sbi->fsync_node_num = 0;
> +}
> +
> +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
> +							struct page *page)
> +{
> +	struct fsync_node_entry *fn;
> +	unsigned long flags;
> +	unsigned int seq_id;
> +
> +	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
> +
> +	get_page(page);
> +	fn->page = page;
> +	INIT_LIST_HEAD(&fn->list);
> +
> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> +	list_add_tail(&fn->list, &sbi->fsync_node_list);
> +	fn->seq_id = sbi->fsync_seg_id++;
> +	seq_id = fn->seq_id;
> +	sbi->fsync_node_num++;
> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> +
> +	return seq_id;
> +}
> +
> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
> +{
> +	struct fsync_node_entry *fn;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> +	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
> +		if (fn->page == page) {
> +			list_del(&fn->list);
> +			sbi->fsync_node_num--;
> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> +			kmem_cache_free(fsync_node_entry_slab, fn);
> +			put_page(page);
> +			return;
> +		}
> +	}
> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> +	f2fs_bug_on(sbi, 1);
> +}
> +
> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> +	sbi->fsync_node_num = 0;
> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> +}
> +
>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
>  {
>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> @@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
>  
>  static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>  				struct writeback_control *wbc, bool do_balance,
> -				enum iostat_type io_type)
> +				enum iostat_type io_type, unsigned int *seq_id)
>  {
>  	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
>  	nid_t nid;
> @@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>  		.io_type = io_type,
>  		.io_wbc = wbc,
>  	};
> +	unsigned int seq;
>  
>  	trace_f2fs_writepage(page, NODE);
>  
> @@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>  
>  	set_page_writeback(page);
>  	ClearPageError(page);
> +
> +	if (f2fs_in_warm_node_list(sbi, page)) {
> +		seq = f2fs_add_fsync_node_entry(sbi, page);
> +		if (seq_id)
> +			*seq_id = seq;
> +	}
> +
>  	fio.old_blkaddr = ni.blk_addr;
>  	f2fs_do_write_node_page(nid, &fio);
>  	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
> @@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>  			goto out_page;
>  
>  		if (__write_node_page(node_page, false, NULL,
> -					&wbc, false, FS_GC_NODE_IO))
> +					&wbc, false, FS_GC_NODE_IO, NULL))
>  			unlock_page(node_page);
>  		goto release_page;
>  	} else {
> @@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>  static int f2fs_write_node_page(struct page *page,
>  				struct writeback_control *wbc)
>  {
> -	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
> +	return __write_node_page(page, false, NULL, wbc, false,
> +						FS_NODE_IO, NULL);
>  }
>  
>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
> -			struct writeback_control *wbc, bool atomic)
> +			struct writeback_control *wbc, bool atomic,
> +			unsigned int *seq_id)
>  {
>  	pgoff_t index;
>  	pgoff_t last_idx = ULONG_MAX;
> @@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>  			ret = __write_node_page(page, atomic &&
>  						page == last_page,
>  						&submitted, wbc, true,
> -						FS_NODE_IO);
> +						FS_NODE_IO, seq_id);
>  			if (ret) {
>  				unlock_page(page);
>  				f2fs_put_page(last_page, 0);
> @@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>  			set_dentry_mark(page, 0);
>  
>  			ret = __write_node_page(page, false, &submitted,
> -						wbc, do_balance, io_type);
> +						wbc, do_balance, io_type, NULL);
>  			if (ret)
>  				unlock_page(page);
>  			else if (submitted)
> @@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>  	return ret;
>  }
>  
> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
> +						unsigned int seq_id)
>  {
> -	pgoff_t index = 0;
> -	struct pagevec pvec;
> -	int ret2, ret = 0;
> -	int nr_pages;
> +	struct fsync_node_entry *fn;
> +	struct page *page;
> +	struct list_head *head = &sbi->fsync_node_list;
> +	unsigned long flags;
> +	unsigned int cur_seq_id = 0;
> +	int ret = 0;
>  
> -	pagevec_init(&pvec);
> +	while (seq_id && cur_seq_id < seq_id) {
> +		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> +		if (list_empty(head)) {
> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> +			break;
> +		}
> +		fn = list_first_entry(head, struct fsync_node_entry, list);
> +		if (fn->seq_id > seq_id) {
> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> +			break;
> +		}
> +		cur_seq_id = fn->seq_id;
> +		page = fn->page;
> +		get_page(page);
> +		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>  
> -	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
> -				PAGECACHE_TAG_WRITEBACK))) {
> -		int i;
> +		f2fs_wait_on_page_writeback(page, NODE, true);
> +		if (TestClearPageError(page))
> +			ret = -EIO;
>  
> -		for (i = 0; i < nr_pages; i++) {
> -			struct page *page = pvec.pages[i];
> +		put_page(page);
>  
> -			if (ino && ino_of_node(page) == ino) {
> -				f2fs_wait_on_page_writeback(page, NODE, true);
> -				if (TestClearPageError(page))
> -					ret = -EIO;
> -			}
> -		}
> -		pagevec_release(&pvec);
> -		cond_resched();
> +		if (ret)
> +			break;
>  	}
>  
> -	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
> -	if (!ret)
> -		ret = ret2;
>  	return ret;
>  }
>  
> @@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
>  			sizeof(struct nat_entry_set));
>  	if (!nat_entry_set_slab)
>  		goto destroy_free_nid;
> +
> +	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
> +			sizeof(struct fsync_node_entry));
> +	if (!fsync_node_entry_slab)
> +		goto destroy_nat_entry_set;
>  	return 0;
>  
> +destroy_nat_entry_set:
> +	kmem_cache_destroy(nat_entry_set_slab);
>  destroy_free_nid:
>  	kmem_cache_destroy(free_nid_slab);
>  destroy_nat_entry:
> @@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
>  
>  void f2fs_destroy_node_manager_caches(void)
>  {
> +	kmem_cache_destroy(fsync_node_entry_slab);
>  	kmem_cache_destroy(nat_entry_set_slab);
>  	kmem_cache_destroy(free_nid_slab);
>  	kmem_cache_destroy(nat_entry_slab);
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index 143ed321076e..34321932754d 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
>  	 */
>  	f2fs_release_ino_entry(sbi, true);
>  
> +	f2fs_bug_on(sbi, sbi->fsync_node_num);
> +
>  	f2fs_leave_shrinker(sbi);
>  	mutex_unlock(&sbi->umount_mutex);
>  
> @@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>  
>  	f2fs_init_ino_entry_info(sbi);
>  
> +	f2fs_init_fsync_node_info(sbi);
> +
>  	/* setup f2fs internal modules */
>  	err = f2fs_build_segment_manager(sbi);
>  	if (err) {
> -- 
> 2.16.2.17.g38e79b1fd

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
  2018-07-14  5:44 ` [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Jaegeuk Kim
@ 2018-07-15  1:30   ` Chao Yu
  2018-07-20 10:45     ` Chao Yu
  0 siblings, 1 reply; 8+ messages in thread
From: Chao Yu @ 2018-07-15  1:30 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-f2fs-devel, linux-kernel, Chao Yu

On 2018/7/14 13:44, Jaegeuk Kim wrote:
> On 07/08, Chao Yu wrote:
>> From: Chao Yu <yuchao0@huawei.com>
>>
>> f2fs recovery flow is relying on dnode block link list, it means fsynced
>> file recovery depends on previous dnode's persistence in the list, so
>> during fsync() we should wait on all regular inode's dnode writebacked
>> before issuing flush.
>>
>> By this way, we can avoid dnode block list being broken by out-of-order
>> IO submission due to IO scheduler or driver.
> 
> Hi Chao,
> 
> Just in case, can we measure some performance numbers with this?

Hi Jaegeuk,

OK, let me do some tests on this.

Thanks,

> 
> Thanks,
> 
>>
>> Signed-off-by: Chao Yu <yuchao0@huawei.com>
>> ---
>> v3:
>> - add a list to link all writebacked dnodes, let fsync() only wait on
>> necessary dnode.
>>  fs/f2fs/checkpoint.c |   2 +
>>  fs/f2fs/data.c       |   2 +
>>  fs/f2fs/f2fs.h       |  21 +++++++-
>>  fs/f2fs/file.c       |  20 +++----
>>  fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
>>  fs/f2fs/super.c      |   4 ++
>>  6 files changed, 153 insertions(+), 44 deletions(-)
>>
>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>> index 8b698bd54490..d5e60d76362e 100644
>> --- a/fs/f2fs/checkpoint.c
>> +++ b/fs/f2fs/checkpoint.c
>> @@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>  
>>  	f2fs_release_ino_entry(sbi, false);
>>  
>> +	f2fs_reset_fsync_node_info(sbi);
>> +
>>  	if (unlikely(f2fs_cp_error(sbi)))
>>  		return -EIO;
>>  
>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>> index 70813a4dda3e..afe76d87575c 100644
>> --- a/fs/f2fs/data.c
>> +++ b/fs/f2fs/data.c
>> @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
>>  					page->index != nid_of_node(page));
>>  
>>  		dec_page_count(sbi, WB_DATA_TYPE(page));
>> +		if (f2fs_in_warm_node_list(sbi, page))
>> +			f2fs_del_fsync_node_entry(sbi, page);
>>  		clear_cold_data(page);
>>  		end_page_writeback(page);
>>  	}
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index c8c865fa8450..bf5f7a336ace 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -228,6 +228,12 @@ struct inode_entry {
>>  	struct inode *inode;	/* vfs inode pointer */
>>  };
>>  
>> +struct fsync_node_entry {
>> +	struct list_head list;	/* list head */
>> +	struct page *page;	/* warm node page pointer */
>> +	unsigned int seq_id;	/* sequence id */
>> +};
>> +
>>  /* for the bitmap indicate blocks to be discarded */
>>  struct discard_entry {
>>  	struct list_head list;	/* list head */
>> @@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
>>  
>>  	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
>>  
>> +	spinlock_t fsync_node_lock;		/* for node entry lock */
>> +	struct list_head fsync_node_list;	/* node list head */
>> +	unsigned int fsync_seg_id;		/* sequence id */
>> +	unsigned int fsync_node_num;		/* number of node entries */
>> +
>>  	/* for orphan inode, use 0'th array */
>>  	unsigned int max_orphans;		/* max orphan inodes */
>>  
>> @@ -2816,6 +2827,10 @@ struct node_info;
>>  
>>  int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
>>  bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
>>  bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
>>  bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
>> @@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
>>  int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
>>  int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
>>  int f2fs_truncate_xattr_node(struct inode *inode);
>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>> +					unsigned int seq_id);
>>  int f2fs_remove_inode_page(struct inode *inode);
>>  struct page *f2fs_new_inode_page(struct inode *inode);
>>  struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
>> @@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
>>  struct page *f2fs_get_node_page_ra(struct page *parent, int start);
>>  void f2fs_move_node_page(struct page *node_page, int gc_type);
>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>> -			struct writeback_control *wbc, bool atomic);
>> +			struct writeback_control *wbc, bool atomic,
>> +			unsigned int *seq_id);
>>  int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>  			struct writeback_control *wbc,
>>  			bool do_balance, enum iostat_type io_type);
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 5e29d4053748..ddea2bfd4042 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>  		.nr_to_write = LONG_MAX,
>>  		.for_reclaim = 0,
>>  	};
>> +	unsigned int seq_id = 0;
>>  
>>  	if (unlikely(f2fs_readonly(inode->i_sb)))
>>  		return 0;
>> @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>  	}
>>  sync_nodes:
>>  	atomic_inc(&sbi->wb_sync_req[NODE]);
>> -	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
>> +	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
>>  	atomic_dec(&sbi->wb_sync_req[NODE]);
>>  	if (ret)
>>  		goto out;
>> @@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>  		goto sync_nodes;
>>  	}
>>  
>> -	/*
>> -	 * If it's atomic_write, it's just fine to keep write ordering. So
>> -	 * here we don't need to wait for node write completion, since we use
>> -	 * node chain which serializes node blocks. If one of node writes are
>> -	 * reordered, we can see simply broken chain, resulting in stopping
>> -	 * roll-forward recovery. It means we'll recover all or none node blocks
>> -	 * given fsync mark.
>> -	 */
>> -	if (!atomic) {
>> -		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
>> -		if (ret)
>> -			goto out;
>> -	}
>> +
>> +	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
>> +	if (ret)
>> +		goto out;
>>  
>>  	/* once recovery info is written, don't need to tack this */
>>  	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>> index 1e30bc305243..31dc372c56a0 100644
>> --- a/fs/f2fs/node.c
>> +++ b/fs/f2fs/node.c
>> @@ -28,6 +28,7 @@
>>  static struct kmem_cache *nat_entry_slab;
>>  static struct kmem_cache *free_nid_slab;
>>  static struct kmem_cache *nat_entry_set_slab;
>> +static struct kmem_cache *fsync_node_entry_slab;
>>  
>>  /*
>>   * Check whether the given nid is within node id range.
>> @@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
>>  							start, nr);
>>  }
>>  
>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
>> +{
>> +	return NODE_MAPPING(sbi) == page->mapping &&
>> +			IS_DNODE(page) && is_cold_node(page);
>> +}
>> +
>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
>> +{
>> +	spin_lock_init(&sbi->fsync_node_lock);
>> +	INIT_LIST_HEAD(&sbi->fsync_node_list);
>> +	sbi->fsync_seg_id = 0;
>> +	sbi->fsync_node_num = 0;
>> +}
>> +
>> +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
>> +							struct page *page)
>> +{
>> +	struct fsync_node_entry *fn;
>> +	unsigned long flags;
>> +	unsigned int seq_id;
>> +
>> +	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
>> +
>> +	get_page(page);
>> +	fn->page = page;
>> +	INIT_LIST_HEAD(&fn->list);
>> +
>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>> +	list_add_tail(&fn->list, &sbi->fsync_node_list);
>> +	fn->seq_id = sbi->fsync_seg_id++;
>> +	seq_id = fn->seq_id;
>> +	sbi->fsync_node_num++;
>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>> +
>> +	return seq_id;
>> +}
>> +
>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
>> +{
>> +	struct fsync_node_entry *fn;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>> +	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
>> +		if (fn->page == page) {
>> +			list_del(&fn->list);
>> +			sbi->fsync_node_num--;
>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>> +			kmem_cache_free(fsync_node_entry_slab, fn);
>> +			put_page(page);
>> +			return;
>> +		}
>> +	}
>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>> +	f2fs_bug_on(sbi, 1);
>> +}
>> +
>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
>> +{
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>> +	sbi->fsync_node_num = 0;
>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>> +}
>> +
>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
>>  {
>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>> @@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
>>  
>>  static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>  				struct writeback_control *wbc, bool do_balance,
>> -				enum iostat_type io_type)
>> +				enum iostat_type io_type, unsigned int *seq_id)
>>  {
>>  	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
>>  	nid_t nid;
>> @@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>  		.io_type = io_type,
>>  		.io_wbc = wbc,
>>  	};
>> +	unsigned int seq;
>>  
>>  	trace_f2fs_writepage(page, NODE);
>>  
>> @@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>  
>>  	set_page_writeback(page);
>>  	ClearPageError(page);
>> +
>> +	if (f2fs_in_warm_node_list(sbi, page)) {
>> +		seq = f2fs_add_fsync_node_entry(sbi, page);
>> +		if (seq_id)
>> +			*seq_id = seq;
>> +	}
>> +
>>  	fio.old_blkaddr = ni.blk_addr;
>>  	f2fs_do_write_node_page(nid, &fio);
>>  	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
>> @@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>  			goto out_page;
>>  
>>  		if (__write_node_page(node_page, false, NULL,
>> -					&wbc, false, FS_GC_NODE_IO))
>> +					&wbc, false, FS_GC_NODE_IO, NULL))
>>  			unlock_page(node_page);
>>  		goto release_page;
>>  	} else {
>> @@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>  static int f2fs_write_node_page(struct page *page,
>>  				struct writeback_control *wbc)
>>  {
>> -	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
>> +	return __write_node_page(page, false, NULL, wbc, false,
>> +						FS_NODE_IO, NULL);
>>  }
>>  
>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>> -			struct writeback_control *wbc, bool atomic)
>> +			struct writeback_control *wbc, bool atomic,
>> +			unsigned int *seq_id)
>>  {
>>  	pgoff_t index;
>>  	pgoff_t last_idx = ULONG_MAX;
>> @@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>  			ret = __write_node_page(page, atomic &&
>>  						page == last_page,
>>  						&submitted, wbc, true,
>> -						FS_NODE_IO);
>> +						FS_NODE_IO, seq_id);
>>  			if (ret) {
>>  				unlock_page(page);
>>  				f2fs_put_page(last_page, 0);
>> @@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>  			set_dentry_mark(page, 0);
>>  
>>  			ret = __write_node_page(page, false, &submitted,
>> -						wbc, do_balance, io_type);
>> +						wbc, do_balance, io_type, NULL);
>>  			if (ret)
>>  				unlock_page(page);
>>  			else if (submitted)
>> @@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>  	return ret;
>>  }
>>  
>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>> +						unsigned int seq_id)
>>  {
>> -	pgoff_t index = 0;
>> -	struct pagevec pvec;
>> -	int ret2, ret = 0;
>> -	int nr_pages;
>> +	struct fsync_node_entry *fn;
>> +	struct page *page;
>> +	struct list_head *head = &sbi->fsync_node_list;
>> +	unsigned long flags;
>> +	unsigned int cur_seq_id = 0;
>> +	int ret = 0;
>>  
>> -	pagevec_init(&pvec);
>> +	while (seq_id && cur_seq_id < seq_id) {
>> +		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>> +		if (list_empty(head)) {
>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>> +			break;
>> +		}
>> +		fn = list_first_entry(head, struct fsync_node_entry, list);
>> +		if (fn->seq_id > seq_id) {
>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>> +			break;
>> +		}
>> +		cur_seq_id = fn->seq_id;
>> +		page = fn->page;
>> +		get_page(page);
>> +		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>  
>> -	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
>> -				PAGECACHE_TAG_WRITEBACK))) {
>> -		int i;
>> +		f2fs_wait_on_page_writeback(page, NODE, true);
>> +		if (TestClearPageError(page))
>> +			ret = -EIO;
>>  
>> -		for (i = 0; i < nr_pages; i++) {
>> -			struct page *page = pvec.pages[i];
>> +		put_page(page);
>>  
>> -			if (ino && ino_of_node(page) == ino) {
>> -				f2fs_wait_on_page_writeback(page, NODE, true);
>> -				if (TestClearPageError(page))
>> -					ret = -EIO;
>> -			}
>> -		}
>> -		pagevec_release(&pvec);
>> -		cond_resched();
>> +		if (ret)
>> +			break;
>>  	}
>>  
>> -	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
>> -	if (!ret)
>> -		ret = ret2;
>>  	return ret;
>>  }
>>  
>> @@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
>>  			sizeof(struct nat_entry_set));
>>  	if (!nat_entry_set_slab)
>>  		goto destroy_free_nid;
>> +
>> +	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
>> +			sizeof(struct fsync_node_entry));
>> +	if (!fsync_node_entry_slab)
>> +		goto destroy_nat_entry_set;
>>  	return 0;
>>  
>> +destroy_nat_entry_set:
>> +	kmem_cache_destroy(nat_entry_set_slab);
>>  destroy_free_nid:
>>  	kmem_cache_destroy(free_nid_slab);
>>  destroy_nat_entry:
>> @@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
>>  
>>  void f2fs_destroy_node_manager_caches(void)
>>  {
>> +	kmem_cache_destroy(fsync_node_entry_slab);
>>  	kmem_cache_destroy(nat_entry_set_slab);
>>  	kmem_cache_destroy(free_nid_slab);
>>  	kmem_cache_destroy(nat_entry_slab);
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index 143ed321076e..34321932754d 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
>>  	 */
>>  	f2fs_release_ino_entry(sbi, true);
>>  
>> +	f2fs_bug_on(sbi, sbi->fsync_node_num);
>> +
>>  	f2fs_leave_shrinker(sbi);
>>  	mutex_unlock(&sbi->umount_mutex);
>>  
>> @@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>>  
>>  	f2fs_init_ino_entry_info(sbi);
>>  
>> +	f2fs_init_fsync_node_info(sbi);
>> +
>>  	/* setup f2fs internal modules */
>>  	err = f2fs_build_segment_manager(sbi);
>>  	if (err) {
>> -- 
>> 2.16.2.17.g38e79b1fd

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
  2018-07-15  1:30   ` Chao Yu
@ 2018-07-20 10:45     ` Chao Yu
  2018-07-21 10:29       ` Chao Yu
  2018-07-23 12:25       ` Jaegeuk Kim
  0 siblings, 2 replies; 8+ messages in thread
From: Chao Yu @ 2018-07-20 10:45 UTC (permalink / raw)
  To: Chao Yu, Jaegeuk Kim; +Cc: linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

On 2018/7/15 9:30, Chao Yu wrote:
> On 2018/7/14 13:44, Jaegeuk Kim wrote:
>> On 07/08, Chao Yu wrote:
>>> From: Chao Yu <yuchao0@huawei.com>
>>>
>>> f2fs recovery flow is relying on dnode block link list, it means fsynced
>>> file recovery depends on previous dnode's persistence in the list, so
>>> during fsync() we should wait on all regular inode's dnode writebacked
>>> before issuing flush.
>>>
>>> By this way, we can avoid dnode block list being broken by out-of-order
>>> IO submission due to IO scheduler or driver.
>>
>> Hi Chao,
>>
>> Just in case, can we measure some performance numbers with this?
> 
> Hi Jaegeuk,
> 
> OK, let me do some tests on this.

Sheng Yong helps to do the test with this patch:

# Microbenchmark
Target:/data (f2fs, -)
64MB / 32768KB / 4KB / 8

# SQLite benchmark
1 / PERSIST / Index

Base:
	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
1	867.82		204.15		41440.03	41370.54	680.8		1025.94		1031.08
2	871.87		205.87		41370.3		40275.2		791.14		1065.84		1101.7
3	866.52		205.69		41795.67	40596.16	694.69		1037.16		1031.48
Avg	868.7366667	205.2366667	41535.33333	40747.3		722.21		1042.98		1054.753333
							
After:
	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
1	798.81		202.5		41143		40613.87	602.71		838.08		913.83
2	805.79		206.47		40297.2		41291.46	604.44		840.75		924.27
3	814.83		206.17		41209.57	40453.62	602.85		834.66		927.91
Avg	806.4766667	205.0466667	40883.25667	40786.31667	603.3333333	837.83		922.0033333

Orig/Patched:							
	0.928332713	0.999074239	0.984300676	1.000957528	0.835398753	0.803303994	0.874141189

It looks like atomic write will suffer performance regression.

I suspect that the criminal is that we forcing to wait all dnode being in
storage cache before we issue PREFLUSH+FUA.

BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
cause the problem: we will lose data of last transaction after SPO, even if
atomic write return no error:

- atomic_open();
- write() P1, P2, P3;
- atomic_commit();
 - writeback data: P1, P2, P3;
 - writeback node: N1, N2, N3;  <--- If N1, N2 is not writebacked, N3 with fsync_mark is
writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
last transaction.
 - preflush + fua;
- power-cut

Thanks,

> 
> Thanks,
> 
>>
>> Thanks,
>>
>>>
>>> Signed-off-by: Chao Yu <yuchao0@huawei.com>
>>> ---
>>> v3:
>>> - add a list to link all writebacked dnodes, let fsync() only wait on
>>> necessary dnode.
>>>  fs/f2fs/checkpoint.c |   2 +
>>>  fs/f2fs/data.c       |   2 +
>>>  fs/f2fs/f2fs.h       |  21 +++++++-
>>>  fs/f2fs/file.c       |  20 +++----
>>>  fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
>>>  fs/f2fs/super.c      |   4 ++
>>>  6 files changed, 153 insertions(+), 44 deletions(-)
>>>
>>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>>> index 8b698bd54490..d5e60d76362e 100644
>>> --- a/fs/f2fs/checkpoint.c
>>> +++ b/fs/f2fs/checkpoint.c
>>> @@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  
>>>  	f2fs_release_ino_entry(sbi, false);
>>>  
>>> +	f2fs_reset_fsync_node_info(sbi);
>>> +
>>>  	if (unlikely(f2fs_cp_error(sbi)))
>>>  		return -EIO;
>>>  
>>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>>> index 70813a4dda3e..afe76d87575c 100644
>>> --- a/fs/f2fs/data.c
>>> +++ b/fs/f2fs/data.c
>>> @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
>>>  					page->index != nid_of_node(page));
>>>  
>>>  		dec_page_count(sbi, WB_DATA_TYPE(page));
>>> +		if (f2fs_in_warm_node_list(sbi, page))
>>> +			f2fs_del_fsync_node_entry(sbi, page);
>>>  		clear_cold_data(page);
>>>  		end_page_writeback(page);
>>>  	}
>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>> index c8c865fa8450..bf5f7a336ace 100644
>>> --- a/fs/f2fs/f2fs.h
>>> +++ b/fs/f2fs/f2fs.h
>>> @@ -228,6 +228,12 @@ struct inode_entry {
>>>  	struct inode *inode;	/* vfs inode pointer */
>>>  };
>>>  
>>> +struct fsync_node_entry {
>>> +	struct list_head list;	/* list head */
>>> +	struct page *page;	/* warm node page pointer */
>>> +	unsigned int seq_id;	/* sequence id */
>>> +};
>>> +
>>>  /* for the bitmap indicate blocks to be discarded */
>>>  struct discard_entry {
>>>  	struct list_head list;	/* list head */
>>> @@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
>>>  
>>>  	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
>>>  
>>> +	spinlock_t fsync_node_lock;		/* for node entry lock */
>>> +	struct list_head fsync_node_list;	/* node list head */
>>> +	unsigned int fsync_seg_id;		/* sequence id */
>>> +	unsigned int fsync_node_num;		/* number of node entries */
>>> +
>>>  	/* for orphan inode, use 0'th array */
>>>  	unsigned int max_orphans;		/* max orphan inodes */
>>>  
>>> @@ -2816,6 +2827,10 @@ struct node_info;
>>>  
>>>  int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
>>>  bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
>>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
>>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
>>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
>>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
>>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
>>>  bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
>>>  bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
>>> @@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
>>>  int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
>>>  int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
>>>  int f2fs_truncate_xattr_node(struct inode *inode);
>>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
>>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>>> +					unsigned int seq_id);
>>>  int f2fs_remove_inode_page(struct inode *inode);
>>>  struct page *f2fs_new_inode_page(struct inode *inode);
>>>  struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
>>> @@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
>>>  struct page *f2fs_get_node_page_ra(struct page *parent, int start);
>>>  void f2fs_move_node_page(struct page *node_page, int gc_type);
>>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>> -			struct writeback_control *wbc, bool atomic);
>>> +			struct writeback_control *wbc, bool atomic,
>>> +			unsigned int *seq_id);
>>>  int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>  			struct writeback_control *wbc,
>>>  			bool do_balance, enum iostat_type io_type);
>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>> index 5e29d4053748..ddea2bfd4042 100644
>>> --- a/fs/f2fs/file.c
>>> +++ b/fs/f2fs/file.c
>>> @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>  		.nr_to_write = LONG_MAX,
>>>  		.for_reclaim = 0,
>>>  	};
>>> +	unsigned int seq_id = 0;
>>>  
>>>  	if (unlikely(f2fs_readonly(inode->i_sb)))
>>>  		return 0;
>>> @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>  	}
>>>  sync_nodes:
>>>  	atomic_inc(&sbi->wb_sync_req[NODE]);
>>> -	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
>>> +	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
>>>  	atomic_dec(&sbi->wb_sync_req[NODE]);
>>>  	if (ret)
>>>  		goto out;
>>> @@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>  		goto sync_nodes;
>>>  	}
>>>  
>>> -	/*
>>> -	 * If it's atomic_write, it's just fine to keep write ordering. So
>>> -	 * here we don't need to wait for node write completion, since we use
>>> -	 * node chain which serializes node blocks. If one of node writes are
>>> -	 * reordered, we can see simply broken chain, resulting in stopping
>>> -	 * roll-forward recovery. It means we'll recover all or none node blocks
>>> -	 * given fsync mark.
>>> -	 */
>>> -	if (!atomic) {
>>> -		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
>>> -		if (ret)
>>> -			goto out;
>>> -	}
>>> +
>>> +	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
>>> +	if (ret)
>>> +		goto out;
>>>  
>>>  	/* once recovery info is written, don't need to tack this */
>>>  	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
>>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>>> index 1e30bc305243..31dc372c56a0 100644
>>> --- a/fs/f2fs/node.c
>>> +++ b/fs/f2fs/node.c
>>> @@ -28,6 +28,7 @@
>>>  static struct kmem_cache *nat_entry_slab;
>>>  static struct kmem_cache *free_nid_slab;
>>>  static struct kmem_cache *nat_entry_set_slab;
>>> +static struct kmem_cache *fsync_node_entry_slab;
>>>  
>>>  /*
>>>   * Check whether the given nid is within node id range.
>>> @@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
>>>  							start, nr);
>>>  }
>>>  
>>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
>>> +{
>>> +	return NODE_MAPPING(sbi) == page->mapping &&
>>> +			IS_DNODE(page) && is_cold_node(page);
>>> +}
>>> +
>>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
>>> +{
>>> +	spin_lock_init(&sbi->fsync_node_lock);
>>> +	INIT_LIST_HEAD(&sbi->fsync_node_list);
>>> +	sbi->fsync_seg_id = 0;
>>> +	sbi->fsync_node_num = 0;
>>> +}
>>> +
>>> +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
>>> +							struct page *page)
>>> +{
>>> +	struct fsync_node_entry *fn;
>>> +	unsigned long flags;
>>> +	unsigned int seq_id;
>>> +
>>> +	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
>>> +
>>> +	get_page(page);
>>> +	fn->page = page;
>>> +	INIT_LIST_HEAD(&fn->list);
>>> +
>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>> +	list_add_tail(&fn->list, &sbi->fsync_node_list);
>>> +	fn->seq_id = sbi->fsync_seg_id++;
>>> +	seq_id = fn->seq_id;
>>> +	sbi->fsync_node_num++;
>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>> +
>>> +	return seq_id;
>>> +}
>>> +
>>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
>>> +{
>>> +	struct fsync_node_entry *fn;
>>> +	unsigned long flags;
>>> +
>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>> +	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
>>> +		if (fn->page == page) {
>>> +			list_del(&fn->list);
>>> +			sbi->fsync_node_num--;
>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>> +			kmem_cache_free(fsync_node_entry_slab, fn);
>>> +			put_page(page);
>>> +			return;
>>> +		}
>>> +	}
>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>> +	f2fs_bug_on(sbi, 1);
>>> +}
>>> +
>>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
>>> +{
>>> +	unsigned long flags;
>>> +
>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>> +	sbi->fsync_node_num = 0;
>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>> +}
>>> +
>>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
>>>  {
>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>> @@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
>>>  
>>>  static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>  				struct writeback_control *wbc, bool do_balance,
>>> -				enum iostat_type io_type)
>>> +				enum iostat_type io_type, unsigned int *seq_id)
>>>  {
>>>  	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
>>>  	nid_t nid;
>>> @@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>  		.io_type = io_type,
>>>  		.io_wbc = wbc,
>>>  	};
>>> +	unsigned int seq;
>>>  
>>>  	trace_f2fs_writepage(page, NODE);
>>>  
>>> @@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>  
>>>  	set_page_writeback(page);
>>>  	ClearPageError(page);
>>> +
>>> +	if (f2fs_in_warm_node_list(sbi, page)) {
>>> +		seq = f2fs_add_fsync_node_entry(sbi, page);
>>> +		if (seq_id)
>>> +			*seq_id = seq;
>>> +	}
>>> +
>>>  	fio.old_blkaddr = ni.blk_addr;
>>>  	f2fs_do_write_node_page(nid, &fio);
>>>  	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
>>> @@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>>  			goto out_page;
>>>  
>>>  		if (__write_node_page(node_page, false, NULL,
>>> -					&wbc, false, FS_GC_NODE_IO))
>>> +					&wbc, false, FS_GC_NODE_IO, NULL))
>>>  			unlock_page(node_page);
>>>  		goto release_page;
>>>  	} else {
>>> @@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>>  static int f2fs_write_node_page(struct page *page,
>>>  				struct writeback_control *wbc)
>>>  {
>>> -	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
>>> +	return __write_node_page(page, false, NULL, wbc, false,
>>> +						FS_NODE_IO, NULL);
>>>  }
>>>  
>>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>> -			struct writeback_control *wbc, bool atomic)
>>> +			struct writeback_control *wbc, bool atomic,
>>> +			unsigned int *seq_id)
>>>  {
>>>  	pgoff_t index;
>>>  	pgoff_t last_idx = ULONG_MAX;
>>> @@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>  			ret = __write_node_page(page, atomic &&
>>>  						page == last_page,
>>>  						&submitted, wbc, true,
>>> -						FS_NODE_IO);
>>> +						FS_NODE_IO, seq_id);
>>>  			if (ret) {
>>>  				unlock_page(page);
>>>  				f2fs_put_page(last_page, 0);
>>> @@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>  			set_dentry_mark(page, 0);
>>>  
>>>  			ret = __write_node_page(page, false, &submitted,
>>> -						wbc, do_balance, io_type);
>>> +						wbc, do_balance, io_type, NULL);
>>>  			if (ret)
>>>  				unlock_page(page);
>>>  			else if (submitted)
>>> @@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>  	return ret;
>>>  }
>>>  
>>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
>>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>>> +						unsigned int seq_id)
>>>  {
>>> -	pgoff_t index = 0;
>>> -	struct pagevec pvec;
>>> -	int ret2, ret = 0;
>>> -	int nr_pages;
>>> +	struct fsync_node_entry *fn;
>>> +	struct page *page;
>>> +	struct list_head *head = &sbi->fsync_node_list;
>>> +	unsigned long flags;
>>> +	unsigned int cur_seq_id = 0;
>>> +	int ret = 0;
>>>  
>>> -	pagevec_init(&pvec);
>>> +	while (seq_id && cur_seq_id < seq_id) {
>>> +		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>> +		if (list_empty(head)) {
>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>> +			break;
>>> +		}
>>> +		fn = list_first_entry(head, struct fsync_node_entry, list);
>>> +		if (fn->seq_id > seq_id) {
>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>> +			break;
>>> +		}
>>> +		cur_seq_id = fn->seq_id;
>>> +		page = fn->page;
>>> +		get_page(page);
>>> +		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>  
>>> -	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
>>> -				PAGECACHE_TAG_WRITEBACK))) {
>>> -		int i;
>>> +		f2fs_wait_on_page_writeback(page, NODE, true);
>>> +		if (TestClearPageError(page))
>>> +			ret = -EIO;
>>>  
>>> -		for (i = 0; i < nr_pages; i++) {
>>> -			struct page *page = pvec.pages[i];
>>> +		put_page(page);
>>>  
>>> -			if (ino && ino_of_node(page) == ino) {
>>> -				f2fs_wait_on_page_writeback(page, NODE, true);
>>> -				if (TestClearPageError(page))
>>> -					ret = -EIO;
>>> -			}
>>> -		}
>>> -		pagevec_release(&pvec);
>>> -		cond_resched();
>>> +		if (ret)
>>> +			break;
>>>  	}
>>>  
>>> -	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
>>> -	if (!ret)
>>> -		ret = ret2;
>>>  	return ret;
>>>  }
>>>  
>>> @@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
>>>  			sizeof(struct nat_entry_set));
>>>  	if (!nat_entry_set_slab)
>>>  		goto destroy_free_nid;
>>> +
>>> +	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
>>> +			sizeof(struct fsync_node_entry));
>>> +	if (!fsync_node_entry_slab)
>>> +		goto destroy_nat_entry_set;
>>>  	return 0;
>>>  
>>> +destroy_nat_entry_set:
>>> +	kmem_cache_destroy(nat_entry_set_slab);
>>>  destroy_free_nid:
>>>  	kmem_cache_destroy(free_nid_slab);
>>>  destroy_nat_entry:
>>> @@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
>>>  
>>>  void f2fs_destroy_node_manager_caches(void)
>>>  {
>>> +	kmem_cache_destroy(fsync_node_entry_slab);
>>>  	kmem_cache_destroy(nat_entry_set_slab);
>>>  	kmem_cache_destroy(free_nid_slab);
>>>  	kmem_cache_destroy(nat_entry_slab);
>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>> index 143ed321076e..34321932754d 100644
>>> --- a/fs/f2fs/super.c
>>> +++ b/fs/f2fs/super.c
>>> @@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
>>>  	 */
>>>  	f2fs_release_ino_entry(sbi, true);
>>>  
>>> +	f2fs_bug_on(sbi, sbi->fsync_node_num);
>>> +
>>>  	f2fs_leave_shrinker(sbi);
>>>  	mutex_unlock(&sbi->umount_mutex);
>>>  
>>> @@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>>>  
>>>  	f2fs_init_ino_entry_info(sbi);
>>>  
>>> +	f2fs_init_fsync_node_info(sbi);
>>> +
>>>  	/* setup f2fs internal modules */
>>>  	err = f2fs_build_segment_manager(sbi);
>>>  	if (err) {
>>> -- 
>>> 2.16.2.17.g38e79b1fd
> 
> .
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
  2018-07-20 10:45     ` Chao Yu
@ 2018-07-21 10:29       ` Chao Yu
  2018-07-23 12:25       ` Jaegeuk Kim
  1 sibling, 0 replies; 8+ messages in thread
From: Chao Yu @ 2018-07-21 10:29 UTC (permalink / raw)
  To: Chao Yu, Jaegeuk Kim; +Cc: linux-f2fs-devel, linux-kernel

On 2018/7/20 18:45, Chao Yu wrote:
> Hi Jaegeuk,
> 
> On 2018/7/15 9:30, Chao Yu wrote:
>> On 2018/7/14 13:44, Jaegeuk Kim wrote:
>>> On 07/08, Chao Yu wrote:
>>>> From: Chao Yu <yuchao0@huawei.com>
>>>>
>>>> f2fs recovery flow is relying on dnode block link list, it means fsynced
>>>> file recovery depends on previous dnode's persistence in the list, so
>>>> during fsync() we should wait on all regular inode's dnode writebacked
>>>> before issuing flush.
>>>>
>>>> By this way, we can avoid dnode block list being broken by out-of-order
>>>> IO submission due to IO scheduler or driver.
>>>
>>> Hi Chao,
>>>
>>> Just in case, can we measure some performance numbers with this?
>>
>> Hi Jaegeuk,
>>
>> OK, let me do some tests on this.
> 
> Sheng Yong helps to do the test with this patch:
> 
> # Microbenchmark
> Target:/data (f2fs, -)
> 64MB / 32768KB / 4KB / 8
> 
> # SQLite benchmark
> 1 / PERSIST / Index
> 
> Base:
> 	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
> 1	867.82		204.15		41440.03	41370.54	680.8		1025.94		1031.08
> 2	871.87		205.87		41370.3		40275.2		791.14		1065.84		1101.7
> 3	866.52		205.69		41795.67	40596.16	694.69		1037.16		1031.48
> Avg	868.7366667	205.2366667	41535.33333	40747.3		722.21		1042.98		1054.753333

If we don't wait dnode writeback for atomic_write:

	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
1	779.91		206.03		41621.5		40333.16	716.9		1038.21		1034.85
2	848.51		204.35		40082.44	39486.17	791.83		1119.96		1083.77
3	772.12		206.27		41335.25	41599.65	723.29		1055.07		971.92
Avg	800.18		205.55		41013.06333	40472.99333	744.0066667	1071.08		1030.18

Patched/Orig:						
	0.92108464	1.001526693	0.987425886	0.993268102	1.030180511	1.026942031	0.976702294

SQLite's performance recovers.

> 							
> After:
> 	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
> 1	798.81		202.5		41143		40613.87	602.71		838.08		913.83
> 2	805.79		206.47		40297.2		41291.46	604.44		840.75		924.27
> 3	814.83		206.17		41209.57	40453.62	602.85		834.66		927.91
> Avg	806.4766667	205.0466667	40883.25667	40786.31667	603.3333333	837.83		922.0033333
> 
> Orig/Patched:							

Sorry, should be Patched/Orig.

Thanks,

> 	0.928332713	0.999074239	0.984300676	1.000957528	0.835398753	0.803303994	0.874141189
> 
> It looks like atomic write will suffer performance regression.
> 
> I suspect that the criminal is that we forcing to wait all dnode being in
> storage cache before we issue PREFLUSH+FUA.
> 
> BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
> cause the problem: we will lose data of last transaction after SPO, even if
> atomic write return no error:
> 
> - atomic_open();
> - write() P1, P2, P3;
> - atomic_commit();
>  - writeback data: P1, P2, P3;
>  - writeback node: N1, N2, N3;  <--- If N1, N2 is not writebacked, N3 with fsync_mark is
> writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
> last transaction.
>  - preflush + fua;
> - power-cut
> 
> Thanks,
> 
>>
>> Thanks,
>>
>>>
>>> Thanks,
>>>
>>>>
>>>> Signed-off-by: Chao Yu <yuchao0@huawei.com>
>>>> ---
>>>> v3:
>>>> - add a list to link all writebacked dnodes, let fsync() only wait on
>>>> necessary dnode.
>>>>  fs/f2fs/checkpoint.c |   2 +
>>>>  fs/f2fs/data.c       |   2 +
>>>>  fs/f2fs/f2fs.h       |  21 +++++++-
>>>>  fs/f2fs/file.c       |  20 +++----
>>>>  fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
>>>>  fs/f2fs/super.c      |   4 ++
>>>>  6 files changed, 153 insertions(+), 44 deletions(-)
>>>>
>>>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>>>> index 8b698bd54490..d5e60d76362e 100644
>>>> --- a/fs/f2fs/checkpoint.c
>>>> +++ b/fs/f2fs/checkpoint.c
>>>> @@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>  
>>>>  	f2fs_release_ino_entry(sbi, false);
>>>>  
>>>> +	f2fs_reset_fsync_node_info(sbi);
>>>> +
>>>>  	if (unlikely(f2fs_cp_error(sbi)))
>>>>  		return -EIO;
>>>>  
>>>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>>>> index 70813a4dda3e..afe76d87575c 100644
>>>> --- a/fs/f2fs/data.c
>>>> +++ b/fs/f2fs/data.c
>>>> @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
>>>>  					page->index != nid_of_node(page));
>>>>  
>>>>  		dec_page_count(sbi, WB_DATA_TYPE(page));
>>>> +		if (f2fs_in_warm_node_list(sbi, page))
>>>> +			f2fs_del_fsync_node_entry(sbi, page);
>>>>  		clear_cold_data(page);
>>>>  		end_page_writeback(page);
>>>>  	}
>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>> index c8c865fa8450..bf5f7a336ace 100644
>>>> --- a/fs/f2fs/f2fs.h
>>>> +++ b/fs/f2fs/f2fs.h
>>>> @@ -228,6 +228,12 @@ struct inode_entry {
>>>>  	struct inode *inode;	/* vfs inode pointer */
>>>>  };
>>>>  
>>>> +struct fsync_node_entry {
>>>> +	struct list_head list;	/* list head */
>>>> +	struct page *page;	/* warm node page pointer */
>>>> +	unsigned int seq_id;	/* sequence id */
>>>> +};
>>>> +
>>>>  /* for the bitmap indicate blocks to be discarded */
>>>>  struct discard_entry {
>>>>  	struct list_head list;	/* list head */
>>>> @@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
>>>>  
>>>>  	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
>>>>  
>>>> +	spinlock_t fsync_node_lock;		/* for node entry lock */
>>>> +	struct list_head fsync_node_list;	/* node list head */
>>>> +	unsigned int fsync_seg_id;		/* sequence id */
>>>> +	unsigned int fsync_node_num;		/* number of node entries */
>>>> +
>>>>  	/* for orphan inode, use 0'th array */
>>>>  	unsigned int max_orphans;		/* max orphan inodes */
>>>>  
>>>> @@ -2816,6 +2827,10 @@ struct node_info;
>>>>  
>>>>  int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
>>>>  bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
>>>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
>>>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
>>>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
>>>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
>>>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
>>>>  bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
>>>>  bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
>>>> @@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
>>>>  int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
>>>>  int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
>>>>  int f2fs_truncate_xattr_node(struct inode *inode);
>>>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
>>>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>>>> +					unsigned int seq_id);
>>>>  int f2fs_remove_inode_page(struct inode *inode);
>>>>  struct page *f2fs_new_inode_page(struct inode *inode);
>>>>  struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
>>>> @@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
>>>>  struct page *f2fs_get_node_page_ra(struct page *parent, int start);
>>>>  void f2fs_move_node_page(struct page *node_page, int gc_type);
>>>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>> -			struct writeback_control *wbc, bool atomic);
>>>> +			struct writeback_control *wbc, bool atomic,
>>>> +			unsigned int *seq_id);
>>>>  int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>>  			struct writeback_control *wbc,
>>>>  			bool do_balance, enum iostat_type io_type);
>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>> index 5e29d4053748..ddea2bfd4042 100644
>>>> --- a/fs/f2fs/file.c
>>>> +++ b/fs/f2fs/file.c
>>>> @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>>  		.nr_to_write = LONG_MAX,
>>>>  		.for_reclaim = 0,
>>>>  	};
>>>> +	unsigned int seq_id = 0;
>>>>  
>>>>  	if (unlikely(f2fs_readonly(inode->i_sb)))
>>>>  		return 0;
>>>> @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>>  	}
>>>>  sync_nodes:
>>>>  	atomic_inc(&sbi->wb_sync_req[NODE]);
>>>> -	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
>>>> +	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
>>>>  	atomic_dec(&sbi->wb_sync_req[NODE]);
>>>>  	if (ret)
>>>>  		goto out;
>>>> @@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>>  		goto sync_nodes;
>>>>  	}
>>>>  
>>>> -	/*
>>>> -	 * If it's atomic_write, it's just fine to keep write ordering. So
>>>> -	 * here we don't need to wait for node write completion, since we use
>>>> -	 * node chain which serializes node blocks. If one of node writes are
>>>> -	 * reordered, we can see simply broken chain, resulting in stopping
>>>> -	 * roll-forward recovery. It means we'll recover all or none node blocks
>>>> -	 * given fsync mark.
>>>> -	 */
>>>> -	if (!atomic) {
>>>> -		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
>>>> -		if (ret)
>>>> -			goto out;
>>>> -	}
>>>> +
>>>> +	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
>>>> +	if (ret)
>>>> +		goto out;
>>>>  
>>>>  	/* once recovery info is written, don't need to tack this */
>>>>  	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
>>>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>>>> index 1e30bc305243..31dc372c56a0 100644
>>>> --- a/fs/f2fs/node.c
>>>> +++ b/fs/f2fs/node.c
>>>> @@ -28,6 +28,7 @@
>>>>  static struct kmem_cache *nat_entry_slab;
>>>>  static struct kmem_cache *free_nid_slab;
>>>>  static struct kmem_cache *nat_entry_set_slab;
>>>> +static struct kmem_cache *fsync_node_entry_slab;
>>>>  
>>>>  /*
>>>>   * Check whether the given nid is within node id range.
>>>> @@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
>>>>  							start, nr);
>>>>  }
>>>>  
>>>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
>>>> +{
>>>> +	return NODE_MAPPING(sbi) == page->mapping &&
>>>> +			IS_DNODE(page) && is_cold_node(page);
>>>> +}
>>>> +
>>>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
>>>> +{
>>>> +	spin_lock_init(&sbi->fsync_node_lock);
>>>> +	INIT_LIST_HEAD(&sbi->fsync_node_list);
>>>> +	sbi->fsync_seg_id = 0;
>>>> +	sbi->fsync_node_num = 0;
>>>> +}
>>>> +
>>>> +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
>>>> +							struct page *page)
>>>> +{
>>>> +	struct fsync_node_entry *fn;
>>>> +	unsigned long flags;
>>>> +	unsigned int seq_id;
>>>> +
>>>> +	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
>>>> +
>>>> +	get_page(page);
>>>> +	fn->page = page;
>>>> +	INIT_LIST_HEAD(&fn->list);
>>>> +
>>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>> +	list_add_tail(&fn->list, &sbi->fsync_node_list);
>>>> +	fn->seq_id = sbi->fsync_seg_id++;
>>>> +	seq_id = fn->seq_id;
>>>> +	sbi->fsync_node_num++;
>>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>> +
>>>> +	return seq_id;
>>>> +}
>>>> +
>>>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
>>>> +{
>>>> +	struct fsync_node_entry *fn;
>>>> +	unsigned long flags;
>>>> +
>>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>> +	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
>>>> +		if (fn->page == page) {
>>>> +			list_del(&fn->list);
>>>> +			sbi->fsync_node_num--;
>>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>> +			kmem_cache_free(fsync_node_entry_slab, fn);
>>>> +			put_page(page);
>>>> +			return;
>>>> +		}
>>>> +	}
>>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>> +	f2fs_bug_on(sbi, 1);
>>>> +}
>>>> +
>>>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
>>>> +{
>>>> +	unsigned long flags;
>>>> +
>>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>> +	sbi->fsync_node_num = 0;
>>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>> +}
>>>> +
>>>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
>>>>  {
>>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>> @@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
>>>>  
>>>>  static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>>  				struct writeback_control *wbc, bool do_balance,
>>>> -				enum iostat_type io_type)
>>>> +				enum iostat_type io_type, unsigned int *seq_id)
>>>>  {
>>>>  	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
>>>>  	nid_t nid;
>>>> @@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>>  		.io_type = io_type,
>>>>  		.io_wbc = wbc,
>>>>  	};
>>>> +	unsigned int seq;
>>>>  
>>>>  	trace_f2fs_writepage(page, NODE);
>>>>  
>>>> @@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>>  
>>>>  	set_page_writeback(page);
>>>>  	ClearPageError(page);
>>>> +
>>>> +	if (f2fs_in_warm_node_list(sbi, page)) {
>>>> +		seq = f2fs_add_fsync_node_entry(sbi, page);
>>>> +		if (seq_id)
>>>> +			*seq_id = seq;
>>>> +	}
>>>> +
>>>>  	fio.old_blkaddr = ni.blk_addr;
>>>>  	f2fs_do_write_node_page(nid, &fio);
>>>>  	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
>>>> @@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>>>  			goto out_page;
>>>>  
>>>>  		if (__write_node_page(node_page, false, NULL,
>>>> -					&wbc, false, FS_GC_NODE_IO))
>>>> +					&wbc, false, FS_GC_NODE_IO, NULL))
>>>>  			unlock_page(node_page);
>>>>  		goto release_page;
>>>>  	} else {
>>>> @@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>>>  static int f2fs_write_node_page(struct page *page,
>>>>  				struct writeback_control *wbc)
>>>>  {
>>>> -	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
>>>> +	return __write_node_page(page, false, NULL, wbc, false,
>>>> +						FS_NODE_IO, NULL);
>>>>  }
>>>>  
>>>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>> -			struct writeback_control *wbc, bool atomic)
>>>> +			struct writeback_control *wbc, bool atomic,
>>>> +			unsigned int *seq_id)
>>>>  {
>>>>  	pgoff_t index;
>>>>  	pgoff_t last_idx = ULONG_MAX;
>>>> @@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>>  			ret = __write_node_page(page, atomic &&
>>>>  						page == last_page,
>>>>  						&submitted, wbc, true,
>>>> -						FS_NODE_IO);
>>>> +						FS_NODE_IO, seq_id);
>>>>  			if (ret) {
>>>>  				unlock_page(page);
>>>>  				f2fs_put_page(last_page, 0);
>>>> @@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>>  			set_dentry_mark(page, 0);
>>>>  
>>>>  			ret = __write_node_page(page, false, &submitted,
>>>> -						wbc, do_balance, io_type);
>>>> +						wbc, do_balance, io_type, NULL);
>>>>  			if (ret)
>>>>  				unlock_page(page);
>>>>  			else if (submitted)
>>>> @@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>>  	return ret;
>>>>  }
>>>>  
>>>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
>>>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>>>> +						unsigned int seq_id)
>>>>  {
>>>> -	pgoff_t index = 0;
>>>> -	struct pagevec pvec;
>>>> -	int ret2, ret = 0;
>>>> -	int nr_pages;
>>>> +	struct fsync_node_entry *fn;
>>>> +	struct page *page;
>>>> +	struct list_head *head = &sbi->fsync_node_list;
>>>> +	unsigned long flags;
>>>> +	unsigned int cur_seq_id = 0;
>>>> +	int ret = 0;
>>>>  
>>>> -	pagevec_init(&pvec);
>>>> +	while (seq_id && cur_seq_id < seq_id) {
>>>> +		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>> +		if (list_empty(head)) {
>>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>> +			break;
>>>> +		}
>>>> +		fn = list_first_entry(head, struct fsync_node_entry, list);
>>>> +		if (fn->seq_id > seq_id) {
>>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>> +			break;
>>>> +		}
>>>> +		cur_seq_id = fn->seq_id;
>>>> +		page = fn->page;
>>>> +		get_page(page);
>>>> +		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>  
>>>> -	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
>>>> -				PAGECACHE_TAG_WRITEBACK))) {
>>>> -		int i;
>>>> +		f2fs_wait_on_page_writeback(page, NODE, true);
>>>> +		if (TestClearPageError(page))
>>>> +			ret = -EIO;
>>>>  
>>>> -		for (i = 0; i < nr_pages; i++) {
>>>> -			struct page *page = pvec.pages[i];
>>>> +		put_page(page);
>>>>  
>>>> -			if (ino && ino_of_node(page) == ino) {
>>>> -				f2fs_wait_on_page_writeback(page, NODE, true);
>>>> -				if (TestClearPageError(page))
>>>> -					ret = -EIO;
>>>> -			}
>>>> -		}
>>>> -		pagevec_release(&pvec);
>>>> -		cond_resched();
>>>> +		if (ret)
>>>> +			break;
>>>>  	}
>>>>  
>>>> -	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
>>>> -	if (!ret)
>>>> -		ret = ret2;
>>>>  	return ret;
>>>>  }
>>>>  
>>>> @@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
>>>>  			sizeof(struct nat_entry_set));
>>>>  	if (!nat_entry_set_slab)
>>>>  		goto destroy_free_nid;
>>>> +
>>>> +	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
>>>> +			sizeof(struct fsync_node_entry));
>>>> +	if (!fsync_node_entry_slab)
>>>> +		goto destroy_nat_entry_set;
>>>>  	return 0;
>>>>  
>>>> +destroy_nat_entry_set:
>>>> +	kmem_cache_destroy(nat_entry_set_slab);
>>>>  destroy_free_nid:
>>>>  	kmem_cache_destroy(free_nid_slab);
>>>>  destroy_nat_entry:
>>>> @@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
>>>>  
>>>>  void f2fs_destroy_node_manager_caches(void)
>>>>  {
>>>> +	kmem_cache_destroy(fsync_node_entry_slab);
>>>>  	kmem_cache_destroy(nat_entry_set_slab);
>>>>  	kmem_cache_destroy(free_nid_slab);
>>>>  	kmem_cache_destroy(nat_entry_slab);
>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>> index 143ed321076e..34321932754d 100644
>>>> --- a/fs/f2fs/super.c
>>>> +++ b/fs/f2fs/super.c
>>>> @@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
>>>>  	 */
>>>>  	f2fs_release_ino_entry(sbi, true);
>>>>  
>>>> +	f2fs_bug_on(sbi, sbi->fsync_node_num);
>>>> +
>>>>  	f2fs_leave_shrinker(sbi);
>>>>  	mutex_unlock(&sbi->umount_mutex);
>>>>  
>>>> @@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>>>>  
>>>>  	f2fs_init_ino_entry_info(sbi);
>>>>  
>>>> +	f2fs_init_fsync_node_info(sbi);
>>>> +
>>>>  	/* setup f2fs internal modules */
>>>>  	err = f2fs_build_segment_manager(sbi);
>>>>  	if (err) {
>>>> -- 
>>>> 2.16.2.17.g38e79b1fd
>>
>> .
>>
> 
> 
> .
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
  2018-07-20 10:45     ` Chao Yu
  2018-07-21 10:29       ` Chao Yu
@ 2018-07-23 12:25       ` Jaegeuk Kim
  2018-07-23 13:17         ` Chao Yu
  1 sibling, 1 reply; 8+ messages in thread
From: Jaegeuk Kim @ 2018-07-23 12:25 UTC (permalink / raw)
  To: Chao Yu; +Cc: Chao Yu, linux-f2fs-devel, linux-kernel

On 07/20, Chao Yu wrote:
> Hi Jaegeuk,
> 
> On 2018/7/15 9:30, Chao Yu wrote:
> > On 2018/7/14 13:44, Jaegeuk Kim wrote:
> >> On 07/08, Chao Yu wrote:
> >>> From: Chao Yu <yuchao0@huawei.com>
> >>>
> >>> f2fs recovery flow is relying on dnode block link list, it means fsynced
> >>> file recovery depends on previous dnode's persistence in the list, so
> >>> during fsync() we should wait on all regular inode's dnode writebacked
> >>> before issuing flush.
> >>>
> >>> By this way, we can avoid dnode block list being broken by out-of-order
> >>> IO submission due to IO scheduler or driver.
> >>
> >> Hi Chao,
> >>
> >> Just in case, can we measure some performance numbers with this?
> > 
> > Hi Jaegeuk,
> > 
> > OK, let me do some tests on this.
> 
> Sheng Yong helps to do the test with this patch:
> 
> # Microbenchmark
> Target:/data (f2fs, -)
> 64MB / 32768KB / 4KB / 8
> 
> # SQLite benchmark
> 1 / PERSIST / Index
> 
> Base:
> 	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
> 1	867.82		204.15		41440.03	41370.54	680.8		1025.94		1031.08
> 2	871.87		205.87		41370.3		40275.2		791.14		1065.84		1101.7
> 3	866.52		205.69		41795.67	40596.16	694.69		1037.16		1031.48
> Avg	868.7366667	205.2366667	41535.33333	40747.3		722.21		1042.98		1054.753333
> 							
> After:
> 	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
> 1	798.81		202.5		41143		40613.87	602.71		838.08		913.83
> 2	805.79		206.47		40297.2		41291.46	604.44		840.75		924.27
> 3	814.83		206.17		41209.57	40453.62	602.85		834.66		927.91
> Avg	806.4766667	205.0466667	40883.25667	40786.31667	603.3333333	837.83		922.0033333
> 
> Orig/Patched:							
> 	0.928332713	0.999074239	0.984300676	1.000957528	0.835398753	0.803303994	0.874141189
> 
> It looks like atomic write will suffer performance regression.
> 
> I suspect that the criminal is that we forcing to wait all dnode being in
> storage cache before we issue PREFLUSH+FUA.
> 
> BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
> cause the problem: we will lose data of last transaction after SPO, even if
> atomic write return no error:

Practically, I don't see db corruption becase of this. We can excuse to lose
the last transaction.

Thanks,

> 
> - atomic_open();
> - write() P1, P2, P3;
> - atomic_commit();
>  - writeback data: P1, P2, P3;
>  - writeback node: N1, N2, N3;  <--- If N1, N2 is not writebacked, N3 with fsync_mark is
> writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
> last transaction.
>  - preflush + fua;
> - power-cut
> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> >>
> >> Thanks,
> >>
> >>>
> >>> Signed-off-by: Chao Yu <yuchao0@huawei.com>
> >>> ---
> >>> v3:
> >>> - add a list to link all writebacked dnodes, let fsync() only wait on
> >>> necessary dnode.
> >>>  fs/f2fs/checkpoint.c |   2 +
> >>>  fs/f2fs/data.c       |   2 +
> >>>  fs/f2fs/f2fs.h       |  21 +++++++-
> >>>  fs/f2fs/file.c       |  20 +++----
> >>>  fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
> >>>  fs/f2fs/super.c      |   4 ++
> >>>  6 files changed, 153 insertions(+), 44 deletions(-)
> >>>
> >>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> >>> index 8b698bd54490..d5e60d76362e 100644
> >>> --- a/fs/f2fs/checkpoint.c
> >>> +++ b/fs/f2fs/checkpoint.c
> >>> @@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >>>  
> >>>  	f2fs_release_ino_entry(sbi, false);
> >>>  
> >>> +	f2fs_reset_fsync_node_info(sbi);
> >>> +
> >>>  	if (unlikely(f2fs_cp_error(sbi)))
> >>>  		return -EIO;
> >>>  
> >>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> >>> index 70813a4dda3e..afe76d87575c 100644
> >>> --- a/fs/f2fs/data.c
> >>> +++ b/fs/f2fs/data.c
> >>> @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
> >>>  					page->index != nid_of_node(page));
> >>>  
> >>>  		dec_page_count(sbi, WB_DATA_TYPE(page));
> >>> +		if (f2fs_in_warm_node_list(sbi, page))
> >>> +			f2fs_del_fsync_node_entry(sbi, page);
> >>>  		clear_cold_data(page);
> >>>  		end_page_writeback(page);
> >>>  	}
> >>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> >>> index c8c865fa8450..bf5f7a336ace 100644
> >>> --- a/fs/f2fs/f2fs.h
> >>> +++ b/fs/f2fs/f2fs.h
> >>> @@ -228,6 +228,12 @@ struct inode_entry {
> >>>  	struct inode *inode;	/* vfs inode pointer */
> >>>  };
> >>>  
> >>> +struct fsync_node_entry {
> >>> +	struct list_head list;	/* list head */
> >>> +	struct page *page;	/* warm node page pointer */
> >>> +	unsigned int seq_id;	/* sequence id */
> >>> +};
> >>> +
> >>>  /* for the bitmap indicate blocks to be discarded */
> >>>  struct discard_entry {
> >>>  	struct list_head list;	/* list head */
> >>> @@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
> >>>  
> >>>  	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
> >>>  
> >>> +	spinlock_t fsync_node_lock;		/* for node entry lock */
> >>> +	struct list_head fsync_node_list;	/* node list head */
> >>> +	unsigned int fsync_seg_id;		/* sequence id */
> >>> +	unsigned int fsync_node_num;		/* number of node entries */
> >>> +
> >>>  	/* for orphan inode, use 0'th array */
> >>>  	unsigned int max_orphans;		/* max orphan inodes */
> >>>  
> >>> @@ -2816,6 +2827,10 @@ struct node_info;
> >>>  
> >>>  int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
> >>>  bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
> >>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
> >>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
> >>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
> >>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
> >>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
> >>>  bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
> >>>  bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
> >>> @@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
> >>>  int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
> >>>  int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
> >>>  int f2fs_truncate_xattr_node(struct inode *inode);
> >>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
> >>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
> >>> +					unsigned int seq_id);
> >>>  int f2fs_remove_inode_page(struct inode *inode);
> >>>  struct page *f2fs_new_inode_page(struct inode *inode);
> >>>  struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
> >>> @@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
> >>>  struct page *f2fs_get_node_page_ra(struct page *parent, int start);
> >>>  void f2fs_move_node_page(struct page *node_page, int gc_type);
> >>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
> >>> -			struct writeback_control *wbc, bool atomic);
> >>> +			struct writeback_control *wbc, bool atomic,
> >>> +			unsigned int *seq_id);
> >>>  int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
> >>>  			struct writeback_control *wbc,
> >>>  			bool do_balance, enum iostat_type io_type);
> >>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> >>> index 5e29d4053748..ddea2bfd4042 100644
> >>> --- a/fs/f2fs/file.c
> >>> +++ b/fs/f2fs/file.c
> >>> @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
> >>>  		.nr_to_write = LONG_MAX,
> >>>  		.for_reclaim = 0,
> >>>  	};
> >>> +	unsigned int seq_id = 0;
> >>>  
> >>>  	if (unlikely(f2fs_readonly(inode->i_sb)))
> >>>  		return 0;
> >>> @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
> >>>  	}
> >>>  sync_nodes:
> >>>  	atomic_inc(&sbi->wb_sync_req[NODE]);
> >>> -	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
> >>> +	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
> >>>  	atomic_dec(&sbi->wb_sync_req[NODE]);
> >>>  	if (ret)
> >>>  		goto out;
> >>> @@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
> >>>  		goto sync_nodes;
> >>>  	}
> >>>  
> >>> -	/*
> >>> -	 * If it's atomic_write, it's just fine to keep write ordering. So
> >>> -	 * here we don't need to wait for node write completion, since we use
> >>> -	 * node chain which serializes node blocks. If one of node writes are
> >>> -	 * reordered, we can see simply broken chain, resulting in stopping
> >>> -	 * roll-forward recovery. It means we'll recover all or none node blocks
> >>> -	 * given fsync mark.
> >>> -	 */
> >>> -	if (!atomic) {
> >>> -		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
> >>> -		if (ret)
> >>> -			goto out;
> >>> -	}
> >>> +
> >>> +	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
> >>> +	if (ret)
> >>> +		goto out;
> >>>  
> >>>  	/* once recovery info is written, don't need to tack this */
> >>>  	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
> >>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> >>> index 1e30bc305243..31dc372c56a0 100644
> >>> --- a/fs/f2fs/node.c
> >>> +++ b/fs/f2fs/node.c
> >>> @@ -28,6 +28,7 @@
> >>>  static struct kmem_cache *nat_entry_slab;
> >>>  static struct kmem_cache *free_nid_slab;
> >>>  static struct kmem_cache *nat_entry_set_slab;
> >>> +static struct kmem_cache *fsync_node_entry_slab;
> >>>  
> >>>  /*
> >>>   * Check whether the given nid is within node id range.
> >>> @@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
> >>>  							start, nr);
> >>>  }
> >>>  
> >>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
> >>> +{
> >>> +	return NODE_MAPPING(sbi) == page->mapping &&
> >>> +			IS_DNODE(page) && is_cold_node(page);
> >>> +}
> >>> +
> >>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
> >>> +{
> >>> +	spin_lock_init(&sbi->fsync_node_lock);
> >>> +	INIT_LIST_HEAD(&sbi->fsync_node_list);
> >>> +	sbi->fsync_seg_id = 0;
> >>> +	sbi->fsync_node_num = 0;
> >>> +}
> >>> +
> >>> +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
> >>> +							struct page *page)
> >>> +{
> >>> +	struct fsync_node_entry *fn;
> >>> +	unsigned long flags;
> >>> +	unsigned int seq_id;
> >>> +
> >>> +	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
> >>> +
> >>> +	get_page(page);
> >>> +	fn->page = page;
> >>> +	INIT_LIST_HEAD(&fn->list);
> >>> +
> >>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> >>> +	list_add_tail(&fn->list, &sbi->fsync_node_list);
> >>> +	fn->seq_id = sbi->fsync_seg_id++;
> >>> +	seq_id = fn->seq_id;
> >>> +	sbi->fsync_node_num++;
> >>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>> +
> >>> +	return seq_id;
> >>> +}
> >>> +
> >>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
> >>> +{
> >>> +	struct fsync_node_entry *fn;
> >>> +	unsigned long flags;
> >>> +
> >>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> >>> +	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
> >>> +		if (fn->page == page) {
> >>> +			list_del(&fn->list);
> >>> +			sbi->fsync_node_num--;
> >>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>> +			kmem_cache_free(fsync_node_entry_slab, fn);
> >>> +			put_page(page);
> >>> +			return;
> >>> +		}
> >>> +	}
> >>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>> +	f2fs_bug_on(sbi, 1);
> >>> +}
> >>> +
> >>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
> >>> +{
> >>> +	unsigned long flags;
> >>> +
> >>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> >>> +	sbi->fsync_node_num = 0;
> >>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>> +}
> >>> +
> >>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
> >>>  {
> >>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> >>> @@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
> >>>  
> >>>  static int __write_node_page(struct page *page, bool atomic, bool *submitted,
> >>>  				struct writeback_control *wbc, bool do_balance,
> >>> -				enum iostat_type io_type)
> >>> +				enum iostat_type io_type, unsigned int *seq_id)
> >>>  {
> >>>  	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
> >>>  	nid_t nid;
> >>> @@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
> >>>  		.io_type = io_type,
> >>>  		.io_wbc = wbc,
> >>>  	};
> >>> +	unsigned int seq;
> >>>  
> >>>  	trace_f2fs_writepage(page, NODE);
> >>>  
> >>> @@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
> >>>  
> >>>  	set_page_writeback(page);
> >>>  	ClearPageError(page);
> >>> +
> >>> +	if (f2fs_in_warm_node_list(sbi, page)) {
> >>> +		seq = f2fs_add_fsync_node_entry(sbi, page);
> >>> +		if (seq_id)
> >>> +			*seq_id = seq;
> >>> +	}
> >>> +
> >>>  	fio.old_blkaddr = ni.blk_addr;
> >>>  	f2fs_do_write_node_page(nid, &fio);
> >>>  	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
> >>> @@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
> >>>  			goto out_page;
> >>>  
> >>>  		if (__write_node_page(node_page, false, NULL,
> >>> -					&wbc, false, FS_GC_NODE_IO))
> >>> +					&wbc, false, FS_GC_NODE_IO, NULL))
> >>>  			unlock_page(node_page);
> >>>  		goto release_page;
> >>>  	} else {
> >>> @@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
> >>>  static int f2fs_write_node_page(struct page *page,
> >>>  				struct writeback_control *wbc)
> >>>  {
> >>> -	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
> >>> +	return __write_node_page(page, false, NULL, wbc, false,
> >>> +						FS_NODE_IO, NULL);
> >>>  }
> >>>  
> >>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
> >>> -			struct writeback_control *wbc, bool atomic)
> >>> +			struct writeback_control *wbc, bool atomic,
> >>> +			unsigned int *seq_id)
> >>>  {
> >>>  	pgoff_t index;
> >>>  	pgoff_t last_idx = ULONG_MAX;
> >>> @@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
> >>>  			ret = __write_node_page(page, atomic &&
> >>>  						page == last_page,
> >>>  						&submitted, wbc, true,
> >>> -						FS_NODE_IO);
> >>> +						FS_NODE_IO, seq_id);
> >>>  			if (ret) {
> >>>  				unlock_page(page);
> >>>  				f2fs_put_page(last_page, 0);
> >>> @@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
> >>>  			set_dentry_mark(page, 0);
> >>>  
> >>>  			ret = __write_node_page(page, false, &submitted,
> >>> -						wbc, do_balance, io_type);
> >>> +						wbc, do_balance, io_type, NULL);
> >>>  			if (ret)
> >>>  				unlock_page(page);
> >>>  			else if (submitted)
> >>> @@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
> >>>  	return ret;
> >>>  }
> >>>  
> >>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
> >>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
> >>> +						unsigned int seq_id)
> >>>  {
> >>> -	pgoff_t index = 0;
> >>> -	struct pagevec pvec;
> >>> -	int ret2, ret = 0;
> >>> -	int nr_pages;
> >>> +	struct fsync_node_entry *fn;
> >>> +	struct page *page;
> >>> +	struct list_head *head = &sbi->fsync_node_list;
> >>> +	unsigned long flags;
> >>> +	unsigned int cur_seq_id = 0;
> >>> +	int ret = 0;
> >>>  
> >>> -	pagevec_init(&pvec);
> >>> +	while (seq_id && cur_seq_id < seq_id) {
> >>> +		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
> >>> +		if (list_empty(head)) {
> >>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>> +			break;
> >>> +		}
> >>> +		fn = list_first_entry(head, struct fsync_node_entry, list);
> >>> +		if (fn->seq_id > seq_id) {
> >>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>> +			break;
> >>> +		}
> >>> +		cur_seq_id = fn->seq_id;
> >>> +		page = fn->page;
> >>> +		get_page(page);
> >>> +		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
> >>>  
> >>> -	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
> >>> -				PAGECACHE_TAG_WRITEBACK))) {
> >>> -		int i;
> >>> +		f2fs_wait_on_page_writeback(page, NODE, true);
> >>> +		if (TestClearPageError(page))
> >>> +			ret = -EIO;
> >>>  
> >>> -		for (i = 0; i < nr_pages; i++) {
> >>> -			struct page *page = pvec.pages[i];
> >>> +		put_page(page);
> >>>  
> >>> -			if (ino && ino_of_node(page) == ino) {
> >>> -				f2fs_wait_on_page_writeback(page, NODE, true);
> >>> -				if (TestClearPageError(page))
> >>> -					ret = -EIO;
> >>> -			}
> >>> -		}
> >>> -		pagevec_release(&pvec);
> >>> -		cond_resched();
> >>> +		if (ret)
> >>> +			break;
> >>>  	}
> >>>  
> >>> -	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
> >>> -	if (!ret)
> >>> -		ret = ret2;
> >>>  	return ret;
> >>>  }
> >>>  
> >>> @@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
> >>>  			sizeof(struct nat_entry_set));
> >>>  	if (!nat_entry_set_slab)
> >>>  		goto destroy_free_nid;
> >>> +
> >>> +	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
> >>> +			sizeof(struct fsync_node_entry));
> >>> +	if (!fsync_node_entry_slab)
> >>> +		goto destroy_nat_entry_set;
> >>>  	return 0;
> >>>  
> >>> +destroy_nat_entry_set:
> >>> +	kmem_cache_destroy(nat_entry_set_slab);
> >>>  destroy_free_nid:
> >>>  	kmem_cache_destroy(free_nid_slab);
> >>>  destroy_nat_entry:
> >>> @@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
> >>>  
> >>>  void f2fs_destroy_node_manager_caches(void)
> >>>  {
> >>> +	kmem_cache_destroy(fsync_node_entry_slab);
> >>>  	kmem_cache_destroy(nat_entry_set_slab);
> >>>  	kmem_cache_destroy(free_nid_slab);
> >>>  	kmem_cache_destroy(nat_entry_slab);
> >>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> >>> index 143ed321076e..34321932754d 100644
> >>> --- a/fs/f2fs/super.c
> >>> +++ b/fs/f2fs/super.c
> >>> @@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
> >>>  	 */
> >>>  	f2fs_release_ino_entry(sbi, true);
> >>>  
> >>> +	f2fs_bug_on(sbi, sbi->fsync_node_num);
> >>> +
> >>>  	f2fs_leave_shrinker(sbi);
> >>>  	mutex_unlock(&sbi->umount_mutex);
> >>>  
> >>> @@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> >>>  
> >>>  	f2fs_init_ino_entry_info(sbi);
> >>>  
> >>> +	f2fs_init_fsync_node_info(sbi);
> >>> +
> >>>  	/* setup f2fs internal modules */
> >>>  	err = f2fs_build_segment_manager(sbi);
> >>>  	if (err) {
> >>> -- 
> >>> 2.16.2.17.g38e79b1fd
> > 
> > .
> > 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list
  2018-07-23 12:25       ` Jaegeuk Kim
@ 2018-07-23 13:17         ` Chao Yu
  0 siblings, 0 replies; 8+ messages in thread
From: Chao Yu @ 2018-07-23 13:17 UTC (permalink / raw)
  To: Jaegeuk Kim, Chao Yu; +Cc: linux-f2fs-devel, linux-kernel

On 2018/7/23 20:25, Jaegeuk Kim wrote:
> On 07/20, Chao Yu wrote:
>> Hi Jaegeuk,
>>
>> On 2018/7/15 9:30, Chao Yu wrote:
>>> On 2018/7/14 13:44, Jaegeuk Kim wrote:
>>>> On 07/08, Chao Yu wrote:
>>>>> From: Chao Yu <yuchao0@huawei.com>
>>>>>
>>>>> f2fs recovery flow is relying on dnode block link list, it means fsynced
>>>>> file recovery depends on previous dnode's persistence in the list, so
>>>>> during fsync() we should wait on all regular inode's dnode writebacked
>>>>> before issuing flush.
>>>>>
>>>>> By this way, we can avoid dnode block list being broken by out-of-order
>>>>> IO submission due to IO scheduler or driver.
>>>>
>>>> Hi Chao,
>>>>
>>>> Just in case, can we measure some performance numbers with this?
>>>
>>> Hi Jaegeuk,
>>>
>>> OK, let me do some tests on this.
>>
>> Sheng Yong helps to do the test with this patch:
>>
>> # Microbenchmark
>> Target:/data (f2fs, -)
>> 64MB / 32768KB / 4KB / 8
>>
>> # SQLite benchmark
>> 1 / PERSIST / Index
>>
>> Base:
>> 	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
>> 1	867.82		204.15		41440.03	41370.54	680.8		1025.94		1031.08
>> 2	871.87		205.87		41370.3		40275.2		791.14		1065.84		1101.7
>> 3	866.52		205.69		41795.67	40596.16	694.69		1037.16		1031.48
>> Avg	868.7366667	205.2366667	41535.33333	40747.3		722.21		1042.98		1054.753333
>> 							
>> After:
>> 	SEQ-RD(MB/s)	SEQ-WR(MB/s)	RND-RD(IOPS)	RND-WR(IOPS)	Insert(TPS)	Update(TPS)	Delete(TPS)
>> 1	798.81		202.5		41143		40613.87	602.71		838.08		913.83
>> 2	805.79		206.47		40297.2		41291.46	604.44		840.75		924.27
>> 3	814.83		206.17		41209.57	40453.62	602.85		834.66		927.91
>> Avg	806.4766667	205.0466667	40883.25667	40786.31667	603.3333333	837.83		922.0033333
>>
>> Orig/Patched:							
>> 	0.928332713	0.999074239	0.984300676	1.000957528	0.835398753	0.803303994	0.874141189
>>
>> It looks like atomic write will suffer performance regression.
>>
>> I suspect that the criminal is that we forcing to wait all dnode being in
>> storage cache before we issue PREFLUSH+FUA.
>>
>> BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
>> cause the problem: we will lose data of last transaction after SPO, even if
>> atomic write return no error:
> 
> Practically, I don't see db corruption becase of this. We can excuse to lose
> the last transaction.

So, let me add these numbers in log of the patch, and change to skip waiting
node page for atomic write.

Thanks,

> 
> Thanks,
> 
>>
>> - atomic_open();
>> - write() P1, P2, P3;
>> - atomic_commit();
>>  - writeback data: P1, P2, P3;
>>  - writeback node: N1, N2, N3;  <--- If N1, N2 is not writebacked, N3 with fsync_mark is
>> writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
>> last transaction.
>>  - preflush + fua;
>> - power-cut
>>
>> Thanks,
>>
>>>
>>> Thanks,
>>>
>>>>
>>>> Thanks,
>>>>
>>>>>
>>>>> Signed-off-by: Chao Yu <yuchao0@huawei.com>
>>>>> ---
>>>>> v3:
>>>>> - add a list to link all writebacked dnodes, let fsync() only wait on
>>>>> necessary dnode.
>>>>>  fs/f2fs/checkpoint.c |   2 +
>>>>>  fs/f2fs/data.c       |   2 +
>>>>>  fs/f2fs/f2fs.h       |  21 +++++++-
>>>>>  fs/f2fs/file.c       |  20 +++----
>>>>>  fs/f2fs/node.c       | 148 +++++++++++++++++++++++++++++++++++++++++----------
>>>>>  fs/f2fs/super.c      |   4 ++
>>>>>  6 files changed, 153 insertions(+), 44 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>>>>> index 8b698bd54490..d5e60d76362e 100644
>>>>> --- a/fs/f2fs/checkpoint.c
>>>>> +++ b/fs/f2fs/checkpoint.c
>>>>> @@ -1379,6 +1379,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>  
>>>>>  	f2fs_release_ino_entry(sbi, false);
>>>>>  
>>>>> +	f2fs_reset_fsync_node_info(sbi);
>>>>> +
>>>>>  	if (unlikely(f2fs_cp_error(sbi)))
>>>>>  		return -EIO;
>>>>>  
>>>>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>>>>> index 70813a4dda3e..afe76d87575c 100644
>>>>> --- a/fs/f2fs/data.c
>>>>> +++ b/fs/f2fs/data.c
>>>>> @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio)
>>>>>  					page->index != nid_of_node(page));
>>>>>  
>>>>>  		dec_page_count(sbi, WB_DATA_TYPE(page));
>>>>> +		if (f2fs_in_warm_node_list(sbi, page))
>>>>> +			f2fs_del_fsync_node_entry(sbi, page);
>>>>>  		clear_cold_data(page);
>>>>>  		end_page_writeback(page);
>>>>>  	}
>>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>>> index c8c865fa8450..bf5f7a336ace 100644
>>>>> --- a/fs/f2fs/f2fs.h
>>>>> +++ b/fs/f2fs/f2fs.h
>>>>> @@ -228,6 +228,12 @@ struct inode_entry {
>>>>>  	struct inode *inode;	/* vfs inode pointer */
>>>>>  };
>>>>>  
>>>>> +struct fsync_node_entry {
>>>>> +	struct list_head list;	/* list head */
>>>>> +	struct page *page;	/* warm node page pointer */
>>>>> +	unsigned int seq_id;	/* sequence id */
>>>>> +};
>>>>> +
>>>>>  /* for the bitmap indicate blocks to be discarded */
>>>>>  struct discard_entry {
>>>>>  	struct list_head list;	/* list head */
>>>>> @@ -1152,6 +1158,11 @@ struct f2fs_sb_info {
>>>>>  
>>>>>  	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
>>>>>  
>>>>> +	spinlock_t fsync_node_lock;		/* for node entry lock */
>>>>> +	struct list_head fsync_node_list;	/* node list head */
>>>>> +	unsigned int fsync_seg_id;		/* sequence id */
>>>>> +	unsigned int fsync_node_num;		/* number of node entries */
>>>>> +
>>>>>  	/* for orphan inode, use 0'th array */
>>>>>  	unsigned int max_orphans;		/* max orphan inodes */
>>>>>  
>>>>> @@ -2816,6 +2827,10 @@ struct node_info;
>>>>>  
>>>>>  int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
>>>>>  bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
>>>>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
>>>>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
>>>>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
>>>>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
>>>>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
>>>>>  bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
>>>>>  bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
>>>>> @@ -2825,7 +2840,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
>>>>>  int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
>>>>>  int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
>>>>>  int f2fs_truncate_xattr_node(struct inode *inode);
>>>>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
>>>>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>>>>> +					unsigned int seq_id);
>>>>>  int f2fs_remove_inode_page(struct inode *inode);
>>>>>  struct page *f2fs_new_inode_page(struct inode *inode);
>>>>>  struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
>>>>> @@ -2834,7 +2850,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
>>>>>  struct page *f2fs_get_node_page_ra(struct page *parent, int start);
>>>>>  void f2fs_move_node_page(struct page *node_page, int gc_type);
>>>>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>>> -			struct writeback_control *wbc, bool atomic);
>>>>> +			struct writeback_control *wbc, bool atomic,
>>>>> +			unsigned int *seq_id);
>>>>>  int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>>>  			struct writeback_control *wbc,
>>>>>  			bool do_balance, enum iostat_type io_type);
>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>> index 5e29d4053748..ddea2bfd4042 100644
>>>>> --- a/fs/f2fs/file.c
>>>>> +++ b/fs/f2fs/file.c
>>>>> @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>>>  		.nr_to_write = LONG_MAX,
>>>>>  		.for_reclaim = 0,
>>>>>  	};
>>>>> +	unsigned int seq_id = 0;
>>>>>  
>>>>>  	if (unlikely(f2fs_readonly(inode->i_sb)))
>>>>>  		return 0;
>>>>> @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>>>  	}
>>>>>  sync_nodes:
>>>>>  	atomic_inc(&sbi->wb_sync_req[NODE]);
>>>>> -	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic);
>>>>> +	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
>>>>>  	atomic_dec(&sbi->wb_sync_req[NODE]);
>>>>>  	if (ret)
>>>>>  		goto out;
>>>>> @@ -292,19 +293,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
>>>>>  		goto sync_nodes;
>>>>>  	}
>>>>>  
>>>>> -	/*
>>>>> -	 * If it's atomic_write, it's just fine to keep write ordering. So
>>>>> -	 * here we don't need to wait for node write completion, since we use
>>>>> -	 * node chain which serializes node blocks. If one of node writes are
>>>>> -	 * reordered, we can see simply broken chain, resulting in stopping
>>>>> -	 * roll-forward recovery. It means we'll recover all or none node blocks
>>>>> -	 * given fsync mark.
>>>>> -	 */
>>>>> -	if (!atomic) {
>>>>> -		ret = f2fs_wait_on_node_pages_writeback(sbi, ino);
>>>>> -		if (ret)
>>>>> -			goto out;
>>>>> -	}
>>>>> +
>>>>> +	ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
>>>>> +	if (ret)
>>>>> +		goto out;
>>>>>  
>>>>>  	/* once recovery info is written, don't need to tack this */
>>>>>  	f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
>>>>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>>>>> index 1e30bc305243..31dc372c56a0 100644
>>>>> --- a/fs/f2fs/node.c
>>>>> +++ b/fs/f2fs/node.c
>>>>> @@ -28,6 +28,7 @@
>>>>>  static struct kmem_cache *nat_entry_slab;
>>>>>  static struct kmem_cache *free_nid_slab;
>>>>>  static struct kmem_cache *nat_entry_set_slab;
>>>>> +static struct kmem_cache *fsync_node_entry_slab;
>>>>>  
>>>>>  /*
>>>>>   * Check whether the given nid is within node id range.
>>>>> @@ -267,6 +268,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
>>>>>  							start, nr);
>>>>>  }
>>>>>  
>>>>> +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
>>>>> +{
>>>>> +	return NODE_MAPPING(sbi) == page->mapping &&
>>>>> +			IS_DNODE(page) && is_cold_node(page);
>>>>> +}
>>>>> +
>>>>> +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
>>>>> +{
>>>>> +	spin_lock_init(&sbi->fsync_node_lock);
>>>>> +	INIT_LIST_HEAD(&sbi->fsync_node_list);
>>>>> +	sbi->fsync_seg_id = 0;
>>>>> +	sbi->fsync_node_num = 0;
>>>>> +}
>>>>> +
>>>>> +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
>>>>> +							struct page *page)
>>>>> +{
>>>>> +	struct fsync_node_entry *fn;
>>>>> +	unsigned long flags;
>>>>> +	unsigned int seq_id;
>>>>> +
>>>>> +	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
>>>>> +
>>>>> +	get_page(page);
>>>>> +	fn->page = page;
>>>>> +	INIT_LIST_HEAD(&fn->list);
>>>>> +
>>>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>>> +	list_add_tail(&fn->list, &sbi->fsync_node_list);
>>>>> +	fn->seq_id = sbi->fsync_seg_id++;
>>>>> +	seq_id = fn->seq_id;
>>>>> +	sbi->fsync_node_num++;
>>>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>> +
>>>>> +	return seq_id;
>>>>> +}
>>>>> +
>>>>> +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
>>>>> +{
>>>>> +	struct fsync_node_entry *fn;
>>>>> +	unsigned long flags;
>>>>> +
>>>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>>> +	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
>>>>> +		if (fn->page == page) {
>>>>> +			list_del(&fn->list);
>>>>> +			sbi->fsync_node_num--;
>>>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>> +			kmem_cache_free(fsync_node_entry_slab, fn);
>>>>> +			put_page(page);
>>>>> +			return;
>>>>> +		}
>>>>> +	}
>>>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>> +	f2fs_bug_on(sbi, 1);
>>>>> +}
>>>>> +
>>>>> +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
>>>>> +{
>>>>> +	unsigned long flags;
>>>>> +
>>>>> +	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>>> +	sbi->fsync_node_num = 0;
>>>>> +	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>> +}
>>>>> +
>>>>>  int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
>>>>>  {
>>>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>>> @@ -1353,7 +1420,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
>>>>>  
>>>>>  static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>>>  				struct writeback_control *wbc, bool do_balance,
>>>>> -				enum iostat_type io_type)
>>>>> +				enum iostat_type io_type, unsigned int *seq_id)
>>>>>  {
>>>>>  	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
>>>>>  	nid_t nid;
>>>>> @@ -1370,6 +1437,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>>>  		.io_type = io_type,
>>>>>  		.io_wbc = wbc,
>>>>>  	};
>>>>> +	unsigned int seq;
>>>>>  
>>>>>  	trace_f2fs_writepage(page, NODE);
>>>>>  
>>>>> @@ -1416,6 +1484,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
>>>>>  
>>>>>  	set_page_writeback(page);
>>>>>  	ClearPageError(page);
>>>>> +
>>>>> +	if (f2fs_in_warm_node_list(sbi, page)) {
>>>>> +		seq = f2fs_add_fsync_node_entry(sbi, page);
>>>>> +		if (seq_id)
>>>>> +			*seq_id = seq;
>>>>> +	}
>>>>> +
>>>>>  	fio.old_blkaddr = ni.blk_addr;
>>>>>  	f2fs_do_write_node_page(nid, &fio);
>>>>>  	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
>>>>> @@ -1463,7 +1538,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>>>>  			goto out_page;
>>>>>  
>>>>>  		if (__write_node_page(node_page, false, NULL,
>>>>> -					&wbc, false, FS_GC_NODE_IO))
>>>>> +					&wbc, false, FS_GC_NODE_IO, NULL))
>>>>>  			unlock_page(node_page);
>>>>>  		goto release_page;
>>>>>  	} else {
>>>>> @@ -1480,11 +1555,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
>>>>>  static int f2fs_write_node_page(struct page *page,
>>>>>  				struct writeback_control *wbc)
>>>>>  {
>>>>> -	return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
>>>>> +	return __write_node_page(page, false, NULL, wbc, false,
>>>>> +						FS_NODE_IO, NULL);
>>>>>  }
>>>>>  
>>>>>  int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>>> -			struct writeback_control *wbc, bool atomic)
>>>>> +			struct writeback_control *wbc, bool atomic,
>>>>> +			unsigned int *seq_id)
>>>>>  {
>>>>>  	pgoff_t index;
>>>>>  	pgoff_t last_idx = ULONG_MAX;
>>>>> @@ -1565,7 +1642,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
>>>>>  			ret = __write_node_page(page, atomic &&
>>>>>  						page == last_page,
>>>>>  						&submitted, wbc, true,
>>>>> -						FS_NODE_IO);
>>>>> +						FS_NODE_IO, seq_id);
>>>>>  			if (ret) {
>>>>>  				unlock_page(page);
>>>>>  				f2fs_put_page(last_page, 0);
>>>>> @@ -1682,7 +1759,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>>>  			set_dentry_mark(page, 0);
>>>>>  
>>>>>  			ret = __write_node_page(page, false, &submitted,
>>>>> -						wbc, do_balance, io_type);
>>>>> +						wbc, do_balance, io_type, NULL);
>>>>>  			if (ret)
>>>>>  				unlock_page(page);
>>>>>  			else if (submitted)
>>>>> @@ -1713,35 +1790,42 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
>>>>>  	return ret;
>>>>>  }
>>>>>  
>>>>> -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
>>>>> +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
>>>>> +						unsigned int seq_id)
>>>>>  {
>>>>> -	pgoff_t index = 0;
>>>>> -	struct pagevec pvec;
>>>>> -	int ret2, ret = 0;
>>>>> -	int nr_pages;
>>>>> +	struct fsync_node_entry *fn;
>>>>> +	struct page *page;
>>>>> +	struct list_head *head = &sbi->fsync_node_list;
>>>>> +	unsigned long flags;
>>>>> +	unsigned int cur_seq_id = 0;
>>>>> +	int ret = 0;
>>>>>  
>>>>> -	pagevec_init(&pvec);
>>>>> +	while (seq_id && cur_seq_id < seq_id) {
>>>>> +		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
>>>>> +		if (list_empty(head)) {
>>>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>> +			break;
>>>>> +		}
>>>>> +		fn = list_first_entry(head, struct fsync_node_entry, list);
>>>>> +		if (fn->seq_id > seq_id) {
>>>>> +			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>> +			break;
>>>>> +		}
>>>>> +		cur_seq_id = fn->seq_id;
>>>>> +		page = fn->page;
>>>>> +		get_page(page);
>>>>> +		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
>>>>>  
>>>>> -	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
>>>>> -				PAGECACHE_TAG_WRITEBACK))) {
>>>>> -		int i;
>>>>> +		f2fs_wait_on_page_writeback(page, NODE, true);
>>>>> +		if (TestClearPageError(page))
>>>>> +			ret = -EIO;
>>>>>  
>>>>> -		for (i = 0; i < nr_pages; i++) {
>>>>> -			struct page *page = pvec.pages[i];
>>>>> +		put_page(page);
>>>>>  
>>>>> -			if (ino && ino_of_node(page) == ino) {
>>>>> -				f2fs_wait_on_page_writeback(page, NODE, true);
>>>>> -				if (TestClearPageError(page))
>>>>> -					ret = -EIO;
>>>>> -			}
>>>>> -		}
>>>>> -		pagevec_release(&pvec);
>>>>> -		cond_resched();
>>>>> +		if (ret)
>>>>> +			break;
>>>>>  	}
>>>>>  
>>>>> -	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
>>>>> -	if (!ret)
>>>>> -		ret = ret2;
>>>>>  	return ret;
>>>>>  }
>>>>>  
>>>>> @@ -2939,8 +3023,15 @@ int __init f2fs_create_node_manager_caches(void)
>>>>>  			sizeof(struct nat_entry_set));
>>>>>  	if (!nat_entry_set_slab)
>>>>>  		goto destroy_free_nid;
>>>>> +
>>>>> +	fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
>>>>> +			sizeof(struct fsync_node_entry));
>>>>> +	if (!fsync_node_entry_slab)
>>>>> +		goto destroy_nat_entry_set;
>>>>>  	return 0;
>>>>>  
>>>>> +destroy_nat_entry_set:
>>>>> +	kmem_cache_destroy(nat_entry_set_slab);
>>>>>  destroy_free_nid:
>>>>>  	kmem_cache_destroy(free_nid_slab);
>>>>>  destroy_nat_entry:
>>>>> @@ -2951,6 +3042,7 @@ int __init f2fs_create_node_manager_caches(void)
>>>>>  
>>>>>  void f2fs_destroy_node_manager_caches(void)
>>>>>  {
>>>>> +	kmem_cache_destroy(fsync_node_entry_slab);
>>>>>  	kmem_cache_destroy(nat_entry_set_slab);
>>>>>  	kmem_cache_destroy(free_nid_slab);
>>>>>  	kmem_cache_destroy(nat_entry_slab);
>>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>>> index 143ed321076e..34321932754d 100644
>>>>> --- a/fs/f2fs/super.c
>>>>> +++ b/fs/f2fs/super.c
>>>>> @@ -1023,6 +1023,8 @@ static void f2fs_put_super(struct super_block *sb)
>>>>>  	 */
>>>>>  	f2fs_release_ino_entry(sbi, true);
>>>>>  
>>>>> +	f2fs_bug_on(sbi, sbi->fsync_node_num);
>>>>> +
>>>>>  	f2fs_leave_shrinker(sbi);
>>>>>  	mutex_unlock(&sbi->umount_mutex);
>>>>>  
>>>>> @@ -2903,6 +2905,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>>>>>  
>>>>>  	f2fs_init_ino_entry_info(sbi);
>>>>>  
>>>>> +	f2fs_init_fsync_node_info(sbi);
>>>>> +
>>>>>  	/* setup f2fs internal modules */
>>>>>  	err = f2fs_build_segment_manager(sbi);
>>>>>  	if (err) {
>>>>> -- 
>>>>> 2.16.2.17.g38e79b1fd
>>>
>>> .
>>>

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-07-23 13:17 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-08 14:04 [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Chao Yu
2018-07-08 14:04 ` [PATCH v3 2/2] f2fs: let checkpoint flush dnode page of regular Chao Yu
2018-07-14  5:44 ` [PATCH v3 1/2] f2fs: fix to avoid broken of dnode block list Jaegeuk Kim
2018-07-15  1:30   ` Chao Yu
2018-07-20 10:45     ` Chao Yu
2018-07-21 10:29       ` Chao Yu
2018-07-23 12:25       ` Jaegeuk Kim
2018-07-23 13:17         ` Chao Yu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).