All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-19 23:29 ` Weichao Guo
@ 2018-01-19 15:47   ` guoweichao
  -1 siblings, 0 replies; 20+ messages in thread
From: guoweichao @ 2018-01-19 15:47 UTC (permalink / raw)
  To: jaegeuk, chao; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei


A critical case is using the free segments as data segments which
are previously node segments to the old checkpoint. With fault injecting
of the newer CP pack, fsck can find errors when checking the sanity of nid.

On 2018/1/20 7:29, Weichao Guo wrote:
> Currently, we set prefree segments as free ones after writing
> a checkpoint, then believe the segments could be used safely.
> However, if a sudden power-off coming and the newer checkpoint
> corrupted due to hardware issues at the same time, we will try
> to use the old checkpoint and may face an inconsistent file
> system status.
> 
> How about add an PRE2 status for prefree segments, and make
> sure the segments could be used safely to both checkpoints?
> Or any better solutions? Or this is not a problem?
> 
> Look forward to your comments!
> 
> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
> ---
>  fs/f2fs/gc.c      | 11 +++++++++--
>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>  fs/f2fs/segment.h |  6 ++++++
>  3 files changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> index 33e7969..153e3ea 100644
> --- a/fs/f2fs/gc.c
> +++ b/fs/f2fs/gc.c
> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>  		 * threshold, we can make them free by checkpoint. Then, we
>  		 * secure free segments which doesn't need fggc any more.
>  		 */
> -		if (prefree_segments(sbi)) {
> +		if (prefree_segments(sbi) || prefree2_segments(sbi)) {
> +			ret = write_checkpoint(sbi, &cpc);
> +			if (ret)
> +				goto stop;
> +		}
> +		if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>  			ret = write_checkpoint(sbi, &cpc);
>  			if (ret)
>  				goto stop;
> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>  			goto gc_more;
>  		}
>  
> -		if (gc_type == FG_GC)
> +		if (gc_type == FG_GC) {
> +			ret = write_checkpoint(sbi, &cpc);
>  			ret = write_checkpoint(sbi, &cpc);
> +		}
>  	}
>  stop:
>  	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 2e8e054d..9dec445 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>  	unsigned int segno;
>  
>  	mutex_lock(&dirty_i->seglist_lock);
> -	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
> +	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>  		__set_test_and_free(sbi, segno);
>  	mutex_unlock(&dirty_i->seglist_lock);
>  }
> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	struct list_head *head = &dcc->entry_list;
>  	struct discard_entry *entry, *this;
>  	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
> -	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
> +	unsigned long *prefree_map;
>  	unsigned int start = 0, end = -1;
>  	unsigned int secno, start_segno;
>  	bool force = (cpc->reason & CP_DISCARD);
> +	int phase = 0;
> +	enum dirty_type dirty_type = PRE2;
>  
>  	mutex_lock(&dirty_i->seglist_lock);
>  
> +next_step:
> +	prefree_map = dirty_i->dirty_segmap[dirty_type];
>  	while (1) {
>  		int i;
>  		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		for (i = start; i < end; i++)
>  			clear_bit(i, prefree_map);
>  
> -		dirty_i->nr_dirty[PRE] -= end - start;
> +		dirty_i->nr_dirty[dirty_type] -= end - start;
>  
>  		if (!test_opt(sbi, DISCARD))
>  			continue;
> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		else
>  			end = start - 1;
>  	}
> +	if (phase == 0) {
> +		/* status change: PRE -> PRE2 */
> +		for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
> +			if (!test_and_set_bit(segno, prefree_map))
> +				dirty_i->nr_dirty[PRE2]++;
> +
> +		phase = 1;
> +		dirty_type = PRE;
> +		goto next_step;
> +	}
>  	mutex_unlock(&dirty_i->seglist_lock);
>  
>  	/* send small discards */
> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>  
>  	mutex_lock(&dirty_i->seglist_lock);
>  	__remove_dirty_segment(sbi, new_segno, PRE);
> +	__remove_dirty_segment(sbi, new_segno, PRE2);
>  	__remove_dirty_segment(sbi, new_segno, DIRTY);
>  	mutex_unlock(&dirty_i->seglist_lock);
>  
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 71a2aaa..f702726 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -263,6 +263,7 @@ enum dirty_type {
>  	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
>  	DIRTY,			/* to count # of dirty segments */
>  	PRE,			/* to count # of entirely obsolete segments */
> +	PRE2,			/* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>  	NR_DIRTY_TYPE
>  };
>  
> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>  	return DIRTY_I(sbi)->nr_dirty[PRE];
>  }
>  
> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
> +{
> +	return DIRTY_I(sbi)->nr_dirty[PRE2];
> +}
> +
>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>  {
>  	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-19 15:47   ` guoweichao
  0 siblings, 0 replies; 20+ messages in thread
From: guoweichao @ 2018-01-19 15:47 UTC (permalink / raw)
  To: jaegeuk, chao; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei


A critical case is using the free segments as data segments which
are previously node segments to the old checkpoint. With fault injecting
of the newer CP pack, fsck can find errors when checking the sanity of nid.

On 2018/1/20 7:29, Weichao Guo wrote:
> Currently, we set prefree segments as free ones after writing
> a checkpoint, then believe the segments could be used safely.
> However, if a sudden power-off coming and the newer checkpoint
> corrupted due to hardware issues at the same time, we will try
> to use the old checkpoint and may face an inconsistent file
> system status.
> 
> How about add an PRE2 status for prefree segments, and make
> sure the segments could be used safely to both checkpoints?
> Or any better solutions? Or this is not a problem?
> 
> Look forward to your comments!
> 
> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
> ---
>  fs/f2fs/gc.c      | 11 +++++++++--
>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>  fs/f2fs/segment.h |  6 ++++++
>  3 files changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> index 33e7969..153e3ea 100644
> --- a/fs/f2fs/gc.c
> +++ b/fs/f2fs/gc.c
> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>  		 * threshold, we can make them free by checkpoint. Then, we
>  		 * secure free segments which doesn't need fggc any more.
>  		 */
> -		if (prefree_segments(sbi)) {
> +		if (prefree_segments(sbi) || prefree2_segments(sbi)) {
> +			ret = write_checkpoint(sbi, &cpc);
> +			if (ret)
> +				goto stop;
> +		}
> +		if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>  			ret = write_checkpoint(sbi, &cpc);
>  			if (ret)
>  				goto stop;
> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>  			goto gc_more;
>  		}
>  
> -		if (gc_type == FG_GC)
> +		if (gc_type == FG_GC) {
> +			ret = write_checkpoint(sbi, &cpc);
>  			ret = write_checkpoint(sbi, &cpc);
> +		}
>  	}
>  stop:
>  	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 2e8e054d..9dec445 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>  	unsigned int segno;
>  
>  	mutex_lock(&dirty_i->seglist_lock);
> -	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
> +	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>  		__set_test_and_free(sbi, segno);
>  	mutex_unlock(&dirty_i->seglist_lock);
>  }
> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	struct list_head *head = &dcc->entry_list;
>  	struct discard_entry *entry, *this;
>  	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
> -	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
> +	unsigned long *prefree_map;
>  	unsigned int start = 0, end = -1;
>  	unsigned int secno, start_segno;
>  	bool force = (cpc->reason & CP_DISCARD);
> +	int phase = 0;
> +	enum dirty_type dirty_type = PRE2;
>  
>  	mutex_lock(&dirty_i->seglist_lock);
>  
> +next_step:
> +	prefree_map = dirty_i->dirty_segmap[dirty_type];
>  	while (1) {
>  		int i;
>  		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		for (i = start; i < end; i++)
>  			clear_bit(i, prefree_map);
>  
> -		dirty_i->nr_dirty[PRE] -= end - start;
> +		dirty_i->nr_dirty[dirty_type] -= end - start;
>  
>  		if (!test_opt(sbi, DISCARD))
>  			continue;
> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		else
>  			end = start - 1;
>  	}
> +	if (phase == 0) {
> +		/* status change: PRE -> PRE2 */
> +		for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
> +			if (!test_and_set_bit(segno, prefree_map))
> +				dirty_i->nr_dirty[PRE2]++;
> +
> +		phase = 1;
> +		dirty_type = PRE;
> +		goto next_step;
> +	}
>  	mutex_unlock(&dirty_i->seglist_lock);
>  
>  	/* send small discards */
> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>  
>  	mutex_lock(&dirty_i->seglist_lock);
>  	__remove_dirty_segment(sbi, new_segno, PRE);
> +	__remove_dirty_segment(sbi, new_segno, PRE2);
>  	__remove_dirty_segment(sbi, new_segno, DIRTY);
>  	mutex_unlock(&dirty_i->seglist_lock);
>  
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 71a2aaa..f702726 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -263,6 +263,7 @@ enum dirty_type {
>  	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
>  	DIRTY,			/* to count # of dirty segments */
>  	PRE,			/* to count # of entirely obsolete segments */
> +	PRE2,			/* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>  	NR_DIRTY_TYPE
>  };
>  
> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>  	return DIRTY_I(sbi)->nr_dirty[PRE];
>  }
>  
> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
> +{
> +	return DIRTY_I(sbi)->nr_dirty[PRE2];
> +}
> +
>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>  {
>  	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-19 23:29 ` Weichao Guo
  0 siblings, 0 replies; 20+ messages in thread
From: Weichao Guo @ 2018-01-19 23:29 UTC (permalink / raw)
  To: jaegeuk, chao; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei, Weichao Guo

Currently, we set prefree segments as free ones after writing
a checkpoint, then believe the segments could be used safely.
However, if a sudden power-off coming and the newer checkpoint
corrupted due to hardware issues at the same time, we will try
to use the old checkpoint and may face an inconsistent file
system status.

How about add an PRE2 status for prefree segments, and make
sure the segments could be used safely to both checkpoints?
Or any better solutions? Or this is not a problem?

Look forward to your comments!

Signed-off-by: Weichao Guo <guoweichao@huawei.com>
---
 fs/f2fs/gc.c      | 11 +++++++++--
 fs/f2fs/segment.c | 21 ++++++++++++++++++---
 fs/f2fs/segment.h |  6 ++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 33e7969..153e3ea 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
 		 * threshold, we can make them free by checkpoint. Then, we
 		 * secure free segments which doesn't need fggc any more.
 		 */
-		if (prefree_segments(sbi)) {
+		if (prefree_segments(sbi) || prefree2_segments(sbi)) {
+			ret = write_checkpoint(sbi, &cpc);
+			if (ret)
+				goto stop;
+		}
+		if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
 			ret = write_checkpoint(sbi, &cpc);
 			if (ret)
 				goto stop;
@@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
 			goto gc_more;
 		}
 
-		if (gc_type == FG_GC)
+		if (gc_type == FG_GC) {
+			ret = write_checkpoint(sbi, &cpc);
 			ret = write_checkpoint(sbi, &cpc);
+		}
 	}
 stop:
 	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 2e8e054d..9dec445 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 	unsigned int segno;
 
 	mutex_lock(&dirty_i->seglist_lock);
-	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
+	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
 		__set_test_and_free(sbi, segno);
 	mutex_unlock(&dirty_i->seglist_lock);
 }
@@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct list_head *head = &dcc->entry_list;
 	struct discard_entry *entry, *this;
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
+	unsigned long *prefree_map;
 	unsigned int start = 0, end = -1;
 	unsigned int secno, start_segno;
 	bool force = (cpc->reason & CP_DISCARD);
+	int phase = 0;
+	enum dirty_type dirty_type = PRE2;
 
 	mutex_lock(&dirty_i->seglist_lock);
 
+next_step:
+	prefree_map = dirty_i->dirty_segmap[dirty_type];
 	while (1) {
 		int i;
 		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
@@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		for (i = start; i < end; i++)
 			clear_bit(i, prefree_map);
 
-		dirty_i->nr_dirty[PRE] -= end - start;
+		dirty_i->nr_dirty[dirty_type] -= end - start;
 
 		if (!test_opt(sbi, DISCARD))
 			continue;
@@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		else
 			end = start - 1;
 	}
+	if (phase == 0) {
+		/* status change: PRE -> PRE2 */
+		for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
+			if (!test_and_set_bit(segno, prefree_map))
+				dirty_i->nr_dirty[PRE2]++;
+
+		phase = 1;
+		dirty_type = PRE;
+		goto next_step;
+	}
 	mutex_unlock(&dirty_i->seglist_lock);
 
 	/* send small discards */
@@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
 
 	mutex_lock(&dirty_i->seglist_lock);
 	__remove_dirty_segment(sbi, new_segno, PRE);
+	__remove_dirty_segment(sbi, new_segno, PRE2);
 	__remove_dirty_segment(sbi, new_segno, DIRTY);
 	mutex_unlock(&dirty_i->seglist_lock);
 
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 71a2aaa..f702726 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -263,6 +263,7 @@ enum dirty_type {
 	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
 	DIRTY,			/* to count # of dirty segments */
 	PRE,			/* to count # of entirely obsolete segments */
+	PRE2,			/* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
 	NR_DIRTY_TYPE
 };
 
@@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
 	return DIRTY_I(sbi)->nr_dirty[PRE];
 }
 
+static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
+{
+	return DIRTY_I(sbi)->nr_dirty[PRE2];
+}
+
 static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
 {
 	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-19 23:29 ` Weichao Guo
  0 siblings, 0 replies; 20+ messages in thread
From: Weichao Guo @ 2018-01-19 23:29 UTC (permalink / raw)
  To: jaegeuk, chao; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei, Weichao Guo

Currently, we set prefree segments as free ones after writing
a checkpoint, then believe the segments could be used safely.
However, if a sudden power-off coming and the newer checkpoint
corrupted due to hardware issues at the same time, we will try
to use the old checkpoint and may face an inconsistent file
system status.

How about add an PRE2 status for prefree segments, and make
sure the segments could be used safely to both checkpoints?
Or any better solutions? Or this is not a problem?

Look forward to your comments!

Signed-off-by: Weichao Guo <guoweichao@huawei.com>
---
 fs/f2fs/gc.c      | 11 +++++++++--
 fs/f2fs/segment.c | 21 ++++++++++++++++++---
 fs/f2fs/segment.h |  6 ++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 33e7969..153e3ea 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
 		 * threshold, we can make them free by checkpoint. Then, we
 		 * secure free segments which doesn't need fggc any more.
 		 */
-		if (prefree_segments(sbi)) {
+		if (prefree_segments(sbi) || prefree2_segments(sbi)) {
+			ret = write_checkpoint(sbi, &cpc);
+			if (ret)
+				goto stop;
+		}
+		if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
 			ret = write_checkpoint(sbi, &cpc);
 			if (ret)
 				goto stop;
@@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
 			goto gc_more;
 		}
 
-		if (gc_type == FG_GC)
+		if (gc_type == FG_GC) {
+			ret = write_checkpoint(sbi, &cpc);
 			ret = write_checkpoint(sbi, &cpc);
+		}
 	}
 stop:
 	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 2e8e054d..9dec445 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 	unsigned int segno;
 
 	mutex_lock(&dirty_i->seglist_lock);
-	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
+	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
 		__set_test_and_free(sbi, segno);
 	mutex_unlock(&dirty_i->seglist_lock);
 }
@@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct list_head *head = &dcc->entry_list;
 	struct discard_entry *entry, *this;
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
+	unsigned long *prefree_map;
 	unsigned int start = 0, end = -1;
 	unsigned int secno, start_segno;
 	bool force = (cpc->reason & CP_DISCARD);
+	int phase = 0;
+	enum dirty_type dirty_type = PRE2;
 
 	mutex_lock(&dirty_i->seglist_lock);
 
+next_step:
+	prefree_map = dirty_i->dirty_segmap[dirty_type];
 	while (1) {
 		int i;
 		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
@@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		for (i = start; i < end; i++)
 			clear_bit(i, prefree_map);
 
-		dirty_i->nr_dirty[PRE] -= end - start;
+		dirty_i->nr_dirty[dirty_type] -= end - start;
 
 		if (!test_opt(sbi, DISCARD))
 			continue;
@@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		else
 			end = start - 1;
 	}
+	if (phase == 0) {
+		/* status change: PRE -> PRE2 */
+		for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
+			if (!test_and_set_bit(segno, prefree_map))
+				dirty_i->nr_dirty[PRE2]++;
+
+		phase = 1;
+		dirty_type = PRE;
+		goto next_step;
+	}
 	mutex_unlock(&dirty_i->seglist_lock);
 
 	/* send small discards */
@@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
 
 	mutex_lock(&dirty_i->seglist_lock);
 	__remove_dirty_segment(sbi, new_segno, PRE);
+	__remove_dirty_segment(sbi, new_segno, PRE2);
 	__remove_dirty_segment(sbi, new_segno, DIRTY);
 	mutex_unlock(&dirty_i->seglist_lock);
 
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 71a2aaa..f702726 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -263,6 +263,7 @@ enum dirty_type {
 	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
 	DIRTY,			/* to count # of dirty segments */
 	PRE,			/* to count # of entirely obsolete segments */
+	PRE2,			/* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
 	NR_DIRTY_TYPE
 };
 
@@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
 	return DIRTY_I(sbi)->nr_dirty[PRE];
 }
 
+static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
+{
+	return DIRTY_I(sbi)->nr_dirty[PRE2];
+}
+
 static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
 {
 	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-19 15:47   ` guoweichao
  (?)
@ 2018-01-20  3:48   ` Gao Xiang via Linux-f2fs-devel
  2018-01-21  3:27     ` Gao Xiang via Linux-f2fs-devel
  -1 siblings, 1 reply; 20+ messages in thread
From: Gao Xiang via Linux-f2fs-devel @ 2018-01-20  3:48 UTC (permalink / raw)
  To: guoweichao; +Cc: heyunlei, linux-f2fs-devel

Hi Weichao,


On 2018/1/19 23:47, guoweichao wrote:
> A critical case is using the free segments as data segments which
> are previously node segments to the old checkpoint. With fault injecting
> of the newer CP pack, fsck can find errors when checking the sanity of nid.
Sorry to interrupt because I'm just curious about this scenario and the 
detail.

As far as I know, if the whole blocks in a segment become invalid,
the segment will become PREFREE, and then if a checkpoint is followed, 
we can reuse this segment or
discard the whole segment safely after this checkpoint was done
(I think It makes sure that this segment is certainly FREE and not 
reused in this checkpoint).

If the segment in the old checkpoint is a node segment, and node blocks 
in the segment are all invalid until the new checkpoint.
It seems no danger to reuse the FREE node segment as a data segment in 
the next checkpoint?

or something related to SPOR? In my mind f2fs-tools ignores POR node 
chain...

Thanks,
> On 2018/1/20 7:29, Weichao Guo wrote:
>> Currently, we set prefree segments as free ones after writing
>> a checkpoint, then believe the segments could be used safely.
>> However, if a sudden power-off coming and the newer checkpoint
>> corrupted due to hardware issues at the same time, we will try
>> to use the old checkpoint and may face an inconsistent file
>> system status.
>>
>> How about add an PRE2 status for prefree segments, and make
>> sure the segments could be used safely to both checkpoints?
>> Or any better solutions? Or this is not a problem?
>>
>> Look forward to your comments!
>>
>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>> ---
>>   fs/f2fs/gc.c      | 11 +++++++++--
>>   fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>   fs/f2fs/segment.h |  6 ++++++
>>   3 files changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> index 33e7969..153e3ea 100644
>> --- a/fs/f2fs/gc.c
>> +++ b/fs/f2fs/gc.c
>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>   		 * threshold, we can make them free by checkpoint. Then, we
>>   		 * secure free segments which doesn't need fggc any more.
>>   		 */
>> -		if (prefree_segments(sbi)) {
>> +		if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>> +			ret = write_checkpoint(sbi, &cpc);
>> +			if (ret)
>> +				goto stop;
>> +		}
>> +		if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>   			ret = write_checkpoint(sbi, &cpc);
>>   			if (ret)
>>   				goto stop;
>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>   			goto gc_more;
>>   		}
>>   
>> -		if (gc_type == FG_GC)
>> +		if (gc_type == FG_GC) {
>> +			ret = write_checkpoint(sbi, &cpc);
>>   			ret = write_checkpoint(sbi, &cpc);
>> +		}
>>   	}
>>   stop:
>>   	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>> index 2e8e054d..9dec445 100644
>> --- a/fs/f2fs/segment.c
>> +++ b/fs/f2fs/segment.c
>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>   	unsigned int segno;
>>   
>>   	mutex_lock(&dirty_i->seglist_lock);
>> -	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> +	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>   		__set_test_and_free(sbi, segno);
>>   	mutex_unlock(&dirty_i->seglist_lock);
>>   }
>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>   	struct list_head *head = &dcc->entry_list;
>>   	struct discard_entry *entry, *this;
>>   	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>> -	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>> +	unsigned long *prefree_map;
>>   	unsigned int start = 0, end = -1;
>>   	unsigned int secno, start_segno;
>>   	bool force = (cpc->reason & CP_DISCARD);
>> +	int phase = 0;
>> +	enum dirty_type dirty_type = PRE2;
>>   
>>   	mutex_lock(&dirty_i->seglist_lock);
>>   
>> +next_step:
>> +	prefree_map = dirty_i->dirty_segmap[dirty_type];
>>   	while (1) {
>>   		int i;
>>   		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>   		for (i = start; i < end; i++)
>>   			clear_bit(i, prefree_map);
>>   
>> -		dirty_i->nr_dirty[PRE] -= end - start;
>> +		dirty_i->nr_dirty[dirty_type] -= end - start;
>>   
>>   		if (!test_opt(sbi, DISCARD))
>>   			continue;
>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>   		else
>>   			end = start - 1;
>>   	}
>> +	if (phase == 0) {
>> +		/* status change: PRE -> PRE2 */
>> +		for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> +			if (!test_and_set_bit(segno, prefree_map))
>> +				dirty_i->nr_dirty[PRE2]++;
>> +
>> +		phase = 1;
>> +		dirty_type = PRE;
>> +		goto next_step;
>> +	}
>>   	mutex_unlock(&dirty_i->seglist_lock);
>>   
>>   	/* send small discards */
>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>   
>>   	mutex_lock(&dirty_i->seglist_lock);
>>   	__remove_dirty_segment(sbi, new_segno, PRE);
>> +	__remove_dirty_segment(sbi, new_segno, PRE2);
>>   	__remove_dirty_segment(sbi, new_segno, DIRTY);
>>   	mutex_unlock(&dirty_i->seglist_lock);
>>   
>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>> index 71a2aaa..f702726 100644
>> --- a/fs/f2fs/segment.h
>> +++ b/fs/f2fs/segment.h
>> @@ -263,6 +263,7 @@ enum dirty_type {
>>   	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
>>   	DIRTY,			/* to count # of dirty segments */
>>   	PRE,			/* to count # of entirely obsolete segments */
>> +	PRE2,			/* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>   	NR_DIRTY_TYPE
>>   };
>>   
>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>   	return DIRTY_I(sbi)->nr_dirty[PRE];
>>   }
>>   
>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>> +{
>> +	return DIRTY_I(sbi)->nr_dirty[PRE2];
>> +}
>> +
>>   static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>   {
>>   	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-19 23:29 ` Weichao Guo
  (?)
  (?)
@ 2018-01-20  7:43 ` Chao Yu
  2018-01-21  2:34     ` Chao Yu
  -1 siblings, 1 reply; 20+ messages in thread
From: Chao Yu @ 2018-01-20  7:43 UTC (permalink / raw)
  To: Weichao Guo, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

Hi Weichao,

On 2018/1/20 7:29, Weichao Guo wrote:
> Currently, we set prefree segments as free ones after writing
> a checkpoint, then believe the segments could be used safely.
> However, if a sudden power-off coming and the newer checkpoint
> corrupted due to hardware issues at the same time, we will try
> to use the old checkpoint and may face an inconsistent file
> system status.

IIUC, you mean:

1. write nodes into segment x;
2. write checkpoint A;
3. remove nodes in segment x;
4. write checkpoint B;
5. issue discard or write datas into segment x;
6. sudden power-cut

But after reboot, we found checkpoint B is corrupted due to hardware, and
then start to use checkpoint A, but nodes in segment x recorded as valid
data in checkpoint A has been overcovered in step 5), so we will encounter
inconsistent meta data, right?

Thanks,

> 
> How about add an PRE2 status for prefree segments, and make
> sure the segments could be used safely to both checkpoints?
> Or any better solutions? Or this is not a problem?
> 
> Look forward to your comments!
> 
> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
> ---
>  fs/f2fs/gc.c      | 11 +++++++++--
>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>  fs/f2fs/segment.h |  6 ++++++
>  3 files changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> index 33e7969..153e3ea 100644
> --- a/fs/f2fs/gc.c
> +++ b/fs/f2fs/gc.c
> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>  		 * threshold, we can make them free by checkpoint. Then, we
>  		 * secure free segments which doesn't need fggc any more.
>  		 */
> -		if (prefree_segments(sbi)) {
> +		if (prefree_segments(sbi) || prefree2_segments(sbi)) {
> +			ret = write_checkpoint(sbi, &cpc);
> +			if (ret)
> +				goto stop;
> +		}
> +		if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>  			ret = write_checkpoint(sbi, &cpc);
>  			if (ret)
>  				goto stop;
> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>  			goto gc_more;
>  		}
>  
> -		if (gc_type == FG_GC)
> +		if (gc_type == FG_GC) {
> +			ret = write_checkpoint(sbi, &cpc);
>  			ret = write_checkpoint(sbi, &cpc);
> +		}
>  	}
>  stop:
>  	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 2e8e054d..9dec445 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>  	unsigned int segno;
>  
>  	mutex_lock(&dirty_i->seglist_lock);
> -	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
> +	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>  		__set_test_and_free(sbi, segno);
>  	mutex_unlock(&dirty_i->seglist_lock);
>  }
> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	struct list_head *head = &dcc->entry_list;
>  	struct discard_entry *entry, *this;
>  	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
> -	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
> +	unsigned long *prefree_map;
>  	unsigned int start = 0, end = -1;
>  	unsigned int secno, start_segno;
>  	bool force = (cpc->reason & CP_DISCARD);
> +	int phase = 0;
> +	enum dirty_type dirty_type = PRE2;
>  
>  	mutex_lock(&dirty_i->seglist_lock);
>  
> +next_step:
> +	prefree_map = dirty_i->dirty_segmap[dirty_type];
>  	while (1) {
>  		int i;
>  		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		for (i = start; i < end; i++)
>  			clear_bit(i, prefree_map);
>  
> -		dirty_i->nr_dirty[PRE] -= end - start;
> +		dirty_i->nr_dirty[dirty_type] -= end - start;
>  
>  		if (!test_opt(sbi, DISCARD))
>  			continue;
> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		else
>  			end = start - 1;
>  	}
> +	if (phase == 0) {
> +		/* status change: PRE -> PRE2 */
> +		for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
> +			if (!test_and_set_bit(segno, prefree_map))
> +				dirty_i->nr_dirty[PRE2]++;
> +
> +		phase = 1;
> +		dirty_type = PRE;
> +		goto next_step;
> +	}
>  	mutex_unlock(&dirty_i->seglist_lock);
>  
>  	/* send small discards */
> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>  
>  	mutex_lock(&dirty_i->seglist_lock);
>  	__remove_dirty_segment(sbi, new_segno, PRE);
> +	__remove_dirty_segment(sbi, new_segno, PRE2);
>  	__remove_dirty_segment(sbi, new_segno, DIRTY);
>  	mutex_unlock(&dirty_i->seglist_lock);
>  
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 71a2aaa..f702726 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -263,6 +263,7 @@ enum dirty_type {
>  	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
>  	DIRTY,			/* to count # of dirty segments */
>  	PRE,			/* to count # of entirely obsolete segments */
> +	PRE2,			/* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>  	NR_DIRTY_TYPE
>  };
>  
> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>  	return DIRTY_I(sbi)->nr_dirty[PRE];
>  }
>  
> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
> +{
> +	return DIRTY_I(sbi)->nr_dirty[PRE2];
> +}
> +
>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>  {
>  	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-20  7:43 ` Chao Yu
@ 2018-01-21  2:34     ` Chao Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2018-01-21  2:34 UTC (permalink / raw)
  To: guoweichao, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

Hi Weichao,

On 2018/1/20 23:50, guoweichao wrote:
> Hi Chao,
> 
> Yes, it is exactly what I mean.
> It seems that F2FS has no responsibility to cover hardware problems.
> However, file systems usually choose redundancy for super block fault tolerance.
> So I think we actually have considered some external errors when designing a file system.
> Our dual checkpoint mechanism is mainly designed to keep at least one stable
> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
> As CP pack is also very critical to F2FS, why not make checkpoint more robustness

I think you're trying to change basic design of A/B upgrade system to A/B/C one,
which can keep always two checkpoint valid. There will be no simple modification
to implement that one, in where we should cover not only prefree case but also
SSR case.

IMO, the biggest problem there is available space, since in checkpoint C, we can
only use invalid blocks in both checkpoint A and B, so in some cases there will
almost be no valid space we can use during allocation, result in frequently
checkpoint.

IMO, what we can do is trying to keep last valid checkpoint being integrity as
possible as we can. One way is that we can add mirror or parity for the
checkpoint which can help to do recovery once checkpoint is corrupted. At
least, I hope that with it in debug version we can help hardware staff to fix
their issue instead of wasting much time to troubleshoot filesystem issue.

Thanks,

> with simple modification in current design and little overhead except for FG_GC.
> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
> 
> Thanks,
> *From:*Chao Yu
> *To:*guoweichao,jaegeuk@kernel.org,
> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
> *Date:*2018-01-20 15:43:23
> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
> 
> Hi Weichao,
> 
> On 2018/1/20 7:29, Weichao Guo wrote:
>> Currently, we set prefree segments as free ones after writing
>> a checkpoint, then believe the segments could be used safely.
>> However, if a sudden power-off coming and the newer checkpoint
>> corrupted due to hardware issues at the same time, we will try
>> to use the old checkpoint and may face an inconsistent file
>> system status.
> 
> IIUC, you mean:
> 
> 1. write nodes into segment x;
> 2. write checkpoint A;
> 3. remove nodes in segment x;
> 4. write checkpoint B;
> 5. issue discard or write datas into segment x;
> 6. sudden power-cut
> 
> But after reboot, we found checkpoint B is corrupted due to hardware, and
> then start to use checkpoint A, but nodes in segment x recorded as valid
> data in checkpoint A has been overcovered in step 5), so we will encounter
> inconsistent meta data, right?
> 
> Thanks,
> 
>>
>> How about add an PRE2 status for prefree segments, and make
>> sure the segments could be used safely to both checkpoints?
>> Or any better solutions? Or this is not a problem?
>>
>> Look forward to your comments!
>>
>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>> ---
>> �fs/f2fs/gc.c �����| 11 +++++++++--
>> �fs/f2fs/segment.c | 21 ++++++++++++++++++---
>> �fs/f2fs/segment.h | �6 ++++++
>> �3 files changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> index 33e7969..153e3ea 100644
>> --- a/fs/f2fs/gc.c
>> +++ b/fs/f2fs/gc.c
>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>> � �* threshold, we can make them free by checkpoint. Then, we
>> � �* secure free segments which doesn't need fggc any more.
>> � �*/
>> - if (prefree_segments(sbi)) {
>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>> + ret = write_checkpoint(sbi, &cpc);
>> + if (ret)
>> + goto stop;
>> + }
>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>> � ret = write_checkpoint(sbi, &cpc);
>> � if (ret)
>> � goto stop;
>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>> � goto gc_more;
>> � }
>> �
>> - if (gc_type == FG_GC)
>> + if (gc_type == FG_GC) {
>> + ret = write_checkpoint(sbi, &cpc);
>> � ret = write_checkpoint(sbi, &cpc);
>> + }
>> � }
>> �stop:
>> � SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>> index 2e8e054d..9dec445 100644
>> --- a/fs/f2fs/segment.c
>> +++ b/fs/f2fs/segment.c
>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>> � unsigned int segno;
>> �
>> � mutex_lock(&dirty_i->seglist_lock);
>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>> � __set_test_and_free(sbi, segno);
>> � mutex_unlock(&dirty_i->seglist_lock);
>> �}
>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> � struct list_head *head = &dcc->entry_list;
>> � struct discard_entry *entry, *this;
>> � struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>> + unsigned long *prefree_map;
>> � unsigned int start = 0, end = -1;
>> � unsigned int secno, start_segno;
>> � bool force = (cpc->reason & CP_DISCARD);
>> + int phase = 0;
>> + enum dirty_type dirty_type = PRE2;
>> �
>> � mutex_lock(&dirty_i->seglist_lock);
>> �
>> +next_step:
>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>> � while (1) {
>> � int i;
>> � start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> � for (i = start; i < end; i++)
>> � clear_bit(i, prefree_map);
>> �
>> - dirty_i->nr_dirty[PRE] -= end - start;
>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>> �
>> � if (!test_opt(sbi, DISCARD))
>> � continue;
>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> � else
>> � end = start - 1;
>> � }
>> + if (phase == 0) {
>> + /* status change: PRE -> PRE2 */
>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> + if (!test_and_set_bit(segno, prefree_map))
>> + dirty_i->nr_dirty[PRE2]++;
>> +
>> + phase = 1;
>> + dirty_type = PRE;
>> + goto next_step;
>> + }
>> � mutex_unlock(&dirty_i->seglist_lock);
>> �
>> � /* send small discards */
>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>> �
>> � mutex_lock(&dirty_i->seglist_lock);
>> � __remove_dirty_segment(sbi, new_segno, PRE);
>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>> � __remove_dirty_segment(sbi, new_segno, DIRTY);
>> � mutex_unlock(&dirty_i->seglist_lock);
>> �
>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>> index 71a2aaa..f702726 100644
>> --- a/fs/f2fs/segment.h
>> +++ b/fs/f2fs/segment.h
>> @@ -263,6 +263,7 @@ enum dirty_type {
>> � DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>> � DIRTY, /* to count # of dirty segments */
>> � PRE, /* to count # of entirely obsolete segments */
>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>> � NR_DIRTY_TYPE
>> �};
>> �
>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>> � return DIRTY_I(sbi)->nr_dirty[PRE];
>> �}
>> �
>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>> +{
>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>> +}
>> +
>> �static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>> �{
>> � return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-21  2:34     ` Chao Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2018-01-21  2:34 UTC (permalink / raw)
  To: guoweichao, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

Hi Weichao,

On 2018/1/20 23:50, guoweichao wrote:
> Hi Chao,
> 
> Yes, it is exactly what I mean.
> It seems that F2FS has no responsibility to cover hardware problems.
> However, file systems usually choose redundancy for super block fault tolerance.
> So I think we actually have considered some external errors when designing a file system.
> Our dual checkpoint mechanism is mainly designed to keep at least one stable
> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
> As CP pack is also very critical to F2FS, why not make checkpoint more robustness

I think you're trying to change basic design of A/B upgrade system to A/B/C one,
which can keep always two checkpoint valid. There will be no simple modification
to implement that one, in where we should cover not only prefree case but also
SSR case.

IMO, the biggest problem there is available space, since in checkpoint C, we can
only use invalid blocks in both checkpoint A and B, so in some cases there will
almost be no valid space we can use during allocation, result in frequently
checkpoint.

IMO, what we can do is trying to keep last valid checkpoint being integrity as
possible as we can. One way is that we can add mirror or parity for the
checkpoint which can help to do recovery once checkpoint is corrupted. At
least, I hope that with it in debug version we can help hardware staff to fix
their issue instead of wasting much time to troubleshoot filesystem issue.

Thanks,

> with simple modification in current design and little overhead except for FG_GC.
> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
> 
> Thanks,
> *From:*Chao Yu
> *To:*guoweichao,jaegeuk@kernel.org,
> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
> *Date:*2018-01-20 15:43:23
> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
> 
> Hi Weichao,
> 
> On 2018/1/20 7:29, Weichao Guo wrote:
>> Currently, we set prefree segments as free ones after writing
>> a checkpoint, then believe the segments could be used safely.
>> However, if a sudden power-off coming and the newer checkpoint
>> corrupted due to hardware issues at the same time, we will try
>> to use the old checkpoint and may face an inconsistent file
>> system status.
> 
> IIUC, you mean:
> 
> 1. write nodes into segment x;
> 2. write checkpoint A;
> 3. remove nodes in segment x;
> 4. write checkpoint B;
> 5. issue discard or write datas into segment x;
> 6. sudden power-cut
> 
> But after reboot, we found checkpoint B is corrupted due to hardware, and
> then start to use checkpoint A, but nodes in segment x recorded as valid
> data in checkpoint A has been overcovered in step 5), so we will encounter
> inconsistent meta data, right?
> 
> Thanks,
> 
>>
>> How about add an PRE2 status for prefree segments, and make
>> sure the segments could be used safely to both checkpoints?
>> Or any better solutions? Or this is not a problem?
>>
>> Look forward to your comments!
>>
>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>> ---
>>  fs/f2fs/gc.c      | 11 +++++++++--
>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>  fs/f2fs/segment.h |  6 ++++++
>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> index 33e7969..153e3ea 100644
>> --- a/fs/f2fs/gc.c
>> +++ b/fs/f2fs/gc.c
>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>    * threshold, we can make them free by checkpoint. Then, we
>>    * secure free segments which doesn't need fggc any more.
>>    */
>> - if (prefree_segments(sbi)) {
>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>> + ret = write_checkpoint(sbi, &cpc);
>> + if (ret)
>> + goto stop;
>> + }
>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>   ret = write_checkpoint(sbi, &cpc);
>>   if (ret)
>>   goto stop;
>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>   goto gc_more;
>>   }
>>  
>> - if (gc_type == FG_GC)
>> + if (gc_type == FG_GC) {
>> + ret = write_checkpoint(sbi, &cpc);
>>   ret = write_checkpoint(sbi, &cpc);
>> + }
>>   }
>>  stop:
>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>> index 2e8e054d..9dec445 100644
>> --- a/fs/f2fs/segment.c
>> +++ b/fs/f2fs/segment.c
>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>   unsigned int segno;
>>  
>>   mutex_lock(&dirty_i->seglist_lock);
>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>   __set_test_and_free(sbi, segno);
>>   mutex_unlock(&dirty_i->seglist_lock);
>>  }
>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>   struct list_head *head = &dcc->entry_list;
>>   struct discard_entry *entry, *this;
>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>> + unsigned long *prefree_map;
>>   unsigned int start = 0, end = -1;
>>   unsigned int secno, start_segno;
>>   bool force = (cpc->reason & CP_DISCARD);
>> + int phase = 0;
>> + enum dirty_type dirty_type = PRE2;
>>  
>>   mutex_lock(&dirty_i->seglist_lock);
>>  
>> +next_step:
>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>   while (1) {
>>   int i;
>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>   for (i = start; i < end; i++)
>>   clear_bit(i, prefree_map);
>>  
>> - dirty_i->nr_dirty[PRE] -= end - start;
>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>  
>>   if (!test_opt(sbi, DISCARD))
>>   continue;
>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>   else
>>   end = start - 1;
>>   }
>> + if (phase == 0) {
>> + /* status change: PRE -> PRE2 */
>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> + if (!test_and_set_bit(segno, prefree_map))
>> + dirty_i->nr_dirty[PRE2]++;
>> +
>> + phase = 1;
>> + dirty_type = PRE;
>> + goto next_step;
>> + }
>>   mutex_unlock(&dirty_i->seglist_lock);
>>  
>>   /* send small discards */
>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>  
>>   mutex_lock(&dirty_i->seglist_lock);
>>   __remove_dirty_segment(sbi, new_segno, PRE);
>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>   mutex_unlock(&dirty_i->seglist_lock);
>>  
>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>> index 71a2aaa..f702726 100644
>> --- a/fs/f2fs/segment.h
>> +++ b/fs/f2fs/segment.h
>> @@ -263,6 +263,7 @@ enum dirty_type {
>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>   DIRTY, /* to count # of dirty segments */
>>   PRE, /* to count # of entirely obsolete segments */
>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>   NR_DIRTY_TYPE
>>  };
>>  
>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>  }
>>  
>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>> +{
>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>> +}
>> +
>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>  {
>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-21  2:34     ` Chao Yu
  (?)
@ 2018-01-21  3:12     ` Gao Xiang via Linux-f2fs-devel
  2018-01-22  8:25         ` Chao Yu
  -1 siblings, 1 reply; 20+ messages in thread
From: Gao Xiang via Linux-f2fs-devel @ 2018-01-21  3:12 UTC (permalink / raw)
  To: Chao Yu, guoweichao; +Cc: linux-fsdevel, heyunlei, linux-f2fs-devel

Hi Chao and Weichao,


On 2018/1/21 10:34, Chao Yu wrote:
> Hi Weichao,
>
> On 2018/1/20 23:50, guoweichao wrote:
>> Hi Chao,
>>
>> Yes, it is exactly what I mean.
>> It seems that F2FS has no responsibility to cover hardware problems.
>> However, file systems usually choose redundancy for super block fault tolerance.
>> So I think we actually have considered some external errors when designing a file system.
>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
> which can keep always two checkpoint valid. There will be no simple modification
> to implement that one, in where we should cover not only prefree case but also
> SSR case.
*nod*, triple-like-backups would not solve all hardware issues in a even 
worse case,
and in order to make all backups available we need to find more extra 
area to leave all modification among 3 checkpoints.

In my opinion, introducing "snapshot" feature could be something more 
useful for phone (yet it is a big feature :( )
and if fsck found something is unrecoverable
we could roll back to a stable and reliable (via re-checking metadata) 
snapshot (maybe hours ago or days ago,
and remind users when rollback in fsck or somewhere).

Sorry to bother all,

Thanks,
>
> IMO, the biggest problem there is available space, since in checkpoint C, we can
> only use invalid blocks in both checkpoint A and B, so in some cases there will
> almost be no valid space we can use during allocation, result in frequently
> checkpoint.
>
> IMO, what we can do is trying to keep last valid checkpoint being integrity as
> possible as we can. One way is that we can add mirror or parity for the
> checkpoint which can help to do recovery once checkpoint is corrupted. At
> least, I hope that with it in debug version we can help hardware staff to fix
> their issue instead of wasting much time to troubleshoot filesystem issue.
>
> Thanks,
>
>> with simple modification in current design and little overhead except for FG_GC.
>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>
>> Thanks,
>> *From:*Chao Yu
>> *To:*guoweichao,jaegeuk@kernel.org,
>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>> *Date:*2018-01-20 15:43:23
>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>
>> Hi Weichao,
>>
>> On 2018/1/20 7:29, Weichao Guo wrote:
>>> Currently, we set prefree segments as free ones after writing
>>> a checkpoint, then believe the segments could be used safely.
>>> However, if a sudden power-off coming and the newer checkpoint
>>> corrupted due to hardware issues at the same time, we will try
>>> to use the old checkpoint and may face an inconsistent file
>>> system status.
>> IIUC, you mean:
>>
>> 1. write nodes into segment x;
>> 2. write checkpoint A;
>> 3. remove nodes in segment x;
>> 4. write checkpoint B;
>> 5. issue discard or write datas into segment x;
>> 6. sudden power-cut
>>
>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>> then start to use checkpoint A, but nodes in segment x recorded as valid
>> data in checkpoint A has been overcovered in step 5), so we will encounter
>> inconsistent meta data, right?
>>
>> Thanks,
>>
>>> How about add an PRE2 status for prefree segments, and make
>>> sure the segments could be used safely to both checkpoints?
>>> Or any better solutions? Or this is not a problem?
>>>
>>> Look forward to your comments!
>>>
>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>> ---
>>>   fs/f2fs/gc.c      | 11 +++++++++--
>>>   fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>   fs/f2fs/segment.h |  6 ++++++
>>>   3 files changed, 33 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>> index 33e7969..153e3ea 100644
>>> --- a/fs/f2fs/gc.c
>>> +++ b/fs/f2fs/gc.c
>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>     * threshold, we can make them free by checkpoint. Then, we
>>>     * secure free segments which doesn't need fggc any more.
>>>     */
>>> - if (prefree_segments(sbi)) {
>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>> + ret = write_checkpoint(sbi, &cpc);
>>> + if (ret)
>>> + goto stop;
>>> + }
>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>    ret = write_checkpoint(sbi, &cpc);
>>>    if (ret)
>>>    goto stop;
>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>    goto gc_more;
>>>    }
>>>   
>>> - if (gc_type == FG_GC)
>>> + if (gc_type == FG_GC) {
>>> + ret = write_checkpoint(sbi, &cpc);
>>>    ret = write_checkpoint(sbi, &cpc);
>>> + }
>>>    }
>>>   stop:
>>>    SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>> index 2e8e054d..9dec445 100644
>>> --- a/fs/f2fs/segment.c
>>> +++ b/fs/f2fs/segment.c
>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>    unsigned int segno;
>>>   
>>>    mutex_lock(&dirty_i->seglist_lock);
>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>    __set_test_and_free(sbi, segno);
>>>    mutex_unlock(&dirty_i->seglist_lock);
>>>   }
>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>    struct list_head *head = &dcc->entry_list;
>>>    struct discard_entry *entry, *this;
>>>    struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>> + unsigned long *prefree_map;
>>>    unsigned int start = 0, end = -1;
>>>    unsigned int secno, start_segno;
>>>    bool force = (cpc->reason & CP_DISCARD);
>>> + int phase = 0;
>>> + enum dirty_type dirty_type = PRE2;
>>>   
>>>    mutex_lock(&dirty_i->seglist_lock);
>>>   
>>> +next_step:
>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>    while (1) {
>>>    int i;
>>>    start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>    for (i = start; i < end; i++)
>>>    clear_bit(i, prefree_map);
>>>   
>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>   
>>>    if (!test_opt(sbi, DISCARD))
>>>    continue;
>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>    else
>>>    end = start - 1;
>>>    }
>>> + if (phase == 0) {
>>> + /* status change: PRE -> PRE2 */
>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>> + if (!test_and_set_bit(segno, prefree_map))
>>> + dirty_i->nr_dirty[PRE2]++;
>>> +
>>> + phase = 1;
>>> + dirty_type = PRE;
>>> + goto next_step;
>>> + }
>>>    mutex_unlock(&dirty_i->seglist_lock);
>>>   
>>>    /* send small discards */
>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>   
>>>    mutex_lock(&dirty_i->seglist_lock);
>>>    __remove_dirty_segment(sbi, new_segno, PRE);
>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>    __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>    mutex_unlock(&dirty_i->seglist_lock);
>>>   
>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>> index 71a2aaa..f702726 100644
>>> --- a/fs/f2fs/segment.h
>>> +++ b/fs/f2fs/segment.h
>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>    DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>    DIRTY, /* to count # of dirty segments */
>>>    PRE, /* to count # of entirely obsolete segments */
>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>    NR_DIRTY_TYPE
>>>   };
>>>   
>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>    return DIRTY_I(sbi)->nr_dirty[PRE];
>>>   }
>>>   
>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>> +{
>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>> +}
>>> +
>>>   static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>   {
>>>    return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>
> ------------------------------------------------------------------------------
> Check out the vibrant tech community on one of the world's most
> engaging tech sites, Slashdot.org! http://sdm.link/slashdot
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-20  3:48   ` Gao Xiang via Linux-f2fs-devel
@ 2018-01-21  3:27     ` Gao Xiang via Linux-f2fs-devel
  2018-01-22  3:01       ` guoweichao
  0 siblings, 1 reply; 20+ messages in thread
From: Gao Xiang via Linux-f2fs-devel @ 2018-01-21  3:27 UTC (permalink / raw)
  To: guoweichao; +Cc: linux-fsdevel, heyunlei, linux-f2fs-devel

Hi Weichao,


On 2018/1/21 0:02, guoweichao wrote:
> Hi Xiang,
>
> it's not related to SPOR. Just consider the case given by Chao Yu.
>

(It seems this email was not sent successfully, I resend it just for 
reference only)
Oh I see, I have considered the scenario what Chao said before.

1. write data into segment x;
2. write checkpoint A;
3. remove data in segment x;
4. write checkpoint B;
5. issue discard or write data into segment x;
6. sudden power-cut

Since f2fs is designed for double backup, 5) and 6), I think, actually belongs to checkpoint C.
and when we are in checkpoint C, checkpoint A becomes unsafe because the latest checkpoint is B.
and I think in that case we cannot prevent data writeback or something to pollute checkpoint A.

However, node segments (another metadata) would be special,
but I have no idea whether introducing PRE2 would make all cases safe or not.

In addition, if some data segments change between checkpoint A and C,
some weird data that it didn't have (new data or data from other files) would be gotten when switching back to checkpoint A.


Thanks,

> Thanks,
> *From:*Gao Xiang
> *To:*guoweichao,
> *Cc:*linux-f2fs-devel@lists.sourceforge.net,heyunlei,
> *Date:*2018-01-20 11:49:22
> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one 
> checkpoint but obsolete to the other
>
> Hi Weichao,
>
>
> On 2018/1/19 23:47, guoweichao wrote:
> > A critical case is using the free segments as data segments which
> > are previously node segments to the old checkpoint. With fault injecting
> > of the newer CP pack, fsck can find errors when checking the sanity 
> of nid.
> Sorry to interrupt because I'm just curious about this scenario and the
> detail.
>
> As far as I know, if the whole blocks in a segment become invalid,
> the segment will become PREFREE, and then if a checkpoint is followed,
> we can reuse this segment or
> discard the whole segment safely after this checkpoint was done
> (I think It makes sure that this segment is certainly FREE and not
> reused in this checkpoint).
>
> If the segment in the old checkpoint is a node segment, and node blocks
> in the segment are all invalid until the new checkpoint.
> It seems no danger to reuse the FREE node segment as a data segment in
> the next checkpoint?
>
> or something related to SPOR? In my mind f2fs-tools ignores POR node
> chain...
>
> Thanks,
> > On 2018/1/20 7:29, Weichao Guo wrote:
> >> Currently, we set prefree segments as free ones after writing
> >> a checkpoint, then believe the segments could be used safely.
> >> However, if a sudden power-off coming and the newer checkpoint
> >> corrupted due to hardware issues at the same time, we will try
> >> to use the old checkpoint and may face an inconsistent file
> >> system status.
> >>
> >> How about add an PRE2 status for prefree segments, and make
> >> sure the segments could be used safely to both checkpoints?
> >> Or any better solutions? Or this is not a problem?
> >>
> >> Look forward to your comments!
> >>
> >> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
> >> ---
> >>   fs/f2fs/gc.c      | 11 +++++++++--
> >>   fs/f2fs/segment.c | 21 ++++++++++++++++++---
> >>   fs/f2fs/segment.h |  6 ++++++
> >>   3 files changed, 33 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> >> index 33e7969..153e3ea 100644
> >> --- a/fs/f2fs/gc.c
> >> +++ b/fs/f2fs/gc.c
> >> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
> >>                * threshold, we can make them free by checkpoint. 
> Then, we
> >>                * secure free segments which doesn't need fggc any more.
> >>                */
> >> -            if (prefree_segments(sbi)) {
> >> +            if (prefree_segments(sbi) || prefree2_segments(sbi)) {
> >> +                    ret = write_checkpoint(sbi, &cpc);
> >> +                    if (ret)
> >> +                            goto stop;
> >> +            }
> >> +            if (has_not_enough_free_secs(sbi, 0, 0) && 
> prefree2_segments(sbi)) {
> >>                       ret = write_checkpoint(sbi, &cpc);
> >>                       if (ret)
> >>                               goto stop;
> >> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
> >>                       goto gc_more;
> >>               }
> >>
> >> -            if (gc_type == FG_GC)
> >> +            if (gc_type == FG_GC) {
> >> +                    ret = write_checkpoint(sbi, &cpc);
> >>                       ret = write_checkpoint(sbi, &cpc);
> >> +            }
> >>       }
> >>   stop:
> >>       SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
> >> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> >> index 2e8e054d..9dec445 100644
> >> --- a/fs/f2fs/segment.c
> >> +++ b/fs/f2fs/segment.c
> >> @@ -1606,7 +1606,7 @@ static void 
> set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
> >>       unsigned int segno;
> >>
> >>       mutex_lock(&dirty_i->seglist_lock);
> >> -    for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], 
> MAIN_SEGS(sbi))
> >> +    for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], 
> MAIN_SEGS(sbi))
> >>               __set_test_and_free(sbi, segno);
> >>       mutex_unlock(&dirty_i->seglist_lock);
> >>   }
> >> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct 
> f2fs_sb_info *sbi, struct cp_control *cpc)
> >>       struct list_head *head = &dcc->entry_list;
> >>       struct discard_entry *entry, *this;
> >>       struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
> >> -    unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
> >> +    unsigned long *prefree_map;
> >>       unsigned int start = 0, end = -1;
> >>       unsigned int secno, start_segno;
> >>       bool force = (cpc->reason & CP_DISCARD);
> >> +    int phase = 0;
> >> +    enum dirty_type dirty_type = PRE2;
> >>
> >>       mutex_lock(&dirty_i->seglist_lock);
> >>
> >> +next_step:
> >> +    prefree_map = dirty_i->dirty_segmap[dirty_type];
> >>       while (1) {
> >>               int i;
> >>               start = find_next_bit(prefree_map, MAIN_SEGS(sbi), 
> end + 1);
> >> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct 
> f2fs_sb_info *sbi, struct cp_control *cpc)
> >>               for (i = start; i < end; i++)
> >>                       clear_bit(i, prefree_map);
> >>
> >> -            dirty_i->nr_dirty[PRE] -= end - start;
> >> +            dirty_i->nr_dirty[dirty_type] -= end - start;
> >>
> >>               if (!test_opt(sbi, DISCARD))
> >>                       continue;
> >> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct 
> f2fs_sb_info *sbi, struct cp_control *cpc)
> >>               else
> >>                       end = start - 1;
> >>       }
> >> +    if (phase == 0) {
> >> +            /* status change: PRE -> PRE2 */
> >> +            for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], 
> MAIN_SEGS(sbi))
> >> +                    if (!test_and_set_bit(segno, prefree_map))
> >> + dirty_i->nr_dirty[PRE2]++;
> >> +
> >> +            phase = 1;
> >> +            dirty_type = PRE;
> >> +            goto next_step;
> >> +    }
> >>       mutex_unlock(&dirty_i->seglist_lock);
> >>
> >>       /* send small discards */
> >> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info 
> *sbi, int type)
> >>
> >>       mutex_lock(&dirty_i->seglist_lock);
> >>       __remove_dirty_segment(sbi, new_segno, PRE);
> >> +    __remove_dirty_segment(sbi, new_segno, PRE2);
> >>       __remove_dirty_segment(sbi, new_segno, DIRTY);
> >>       mutex_unlock(&dirty_i->seglist_lock);
> >>
> >> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> >> index 71a2aaa..f702726 100644
> >> --- a/fs/f2fs/segment.h
> >> +++ b/fs/f2fs/segment.h
> >> @@ -263,6 +263,7 @@ enum dirty_type {
> >>       DIRTY_COLD_NODE,        /* dirty segments assigned as cold 
> node logs */
> >>       DIRTY,                  /* to count # of dirty segments */
> >>       PRE,                    /* to count # of entirely obsolete 
> segments */
> >> +    PRE2,                   /* to count # of the segments free to 
> one checkpoint but obsolete to the other checkpoint */
> >>       NR_DIRTY_TYPE
> >>   };
> >>
> >> @@ -477,6 +478,11 @@ static inline unsigned int 
> prefree_segments(struct f2fs_sb_info *sbi)
> >>       return DIRTY_I(sbi)->nr_dirty[PRE];
> >>   }
> >>
> >> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
> >> +{
> >> +    return DIRTY_I(sbi)->nr_dirty[PRE2];
> >> +}
> >> +
> >>   static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
> >>   {
> >>       return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
> >>
>


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-21  2:34     ` Chao Yu
@ 2018-01-22  2:40       ` guoweichao
  -1 siblings, 0 replies; 20+ messages in thread
From: guoweichao @ 2018-01-22  2:40 UTC (permalink / raw)
  To: Chao Yu, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

Hi Chao,

On 2018/1/21 10:34, Chao Yu wrote:
> Hi Weichao,
> 
> On 2018/1/20 23:50, guoweichao wrote:
>> Hi Chao,
>>
>> Yes, it is exactly what I mean.
>> It seems that F2FS has no responsibility to cover hardware problems.
>> However, file systems usually choose redundancy for super block fault tolerance.
>> So I think we actually have considered some external errors when designing a file system.
>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
> 
> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
> which can keep always two checkpoint valid. There will be no simple modification
> to implement that one, in where we should cover not only prefree case but also
> SSR case.
> 
Nope, I do not intent to add another checkpoint. My main idea is reusing the invalid blocks
after two checkpoints are recorded them as invalid. We could mark or record the blocks that
are free to one checkpoint and valid to the other one as PARTIAL_FREE. And when set prefree
segments as free or use invalid blocks in SSR mode, just igore PARTIAL_FREE blocks. If there
is no other blocks can be used, just write a new checkpoint.

> IMO, the biggest problem there is available space, since in checkpoint C, we can
> only use invalid blocks in both checkpoint A and B, so in some cases there will
> almost be no valid space we can use during allocation, result in frequently
> checkpoint.
> 
> IMO, what we can do is trying to keep last valid checkpoint being integrity as
> possible as we can. One way is that we can add mirror or parity for the
> checkpoint which can help to do recovery once checkpoint is corrupted. At
> least, I hope that with it in debug version we can help hardware staff to fix
> their issue instead of wasting much time to troubleshoot filesystem issue.
> 
Yes, I agree. Adding a backup for the latest checkpoint in debug version is OK. Sometimes,
hardware issues are hard to uncover. We should try to separate them from F2FS.

Thanks,

> Thanks,
> 
>> with simple modification in current design and little overhead except for FG_GC.
>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>
>> Thanks,
>> *From:*Chao Yu
>> *To:*guoweichao,jaegeuk@kernel.org,
>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>> *Date:*2018-01-20 15:43:23
>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>
>> Hi Weichao,
>>
>> On 2018/1/20 7:29, Weichao Guo wrote:
>>> Currently, we set prefree segments as free ones after writing
>>> a checkpoint, then believe the segments could be used safely.
>>> However, if a sudden power-off coming and the newer checkpoint
>>> corrupted due to hardware issues at the same time, we will try
>>> to use the old checkpoint and may face an inconsistent file
>>> system status.
>>
>> IIUC, you mean:
>>
>> 1. write nodes into segment x;
>> 2. write checkpoint A;
>> 3. remove nodes in segment x;
>> 4. write checkpoint B;
>> 5. issue discard or write datas into segment x;
>> 6. sudden power-cut
>>
>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>> then start to use checkpoint A, but nodes in segment x recorded as valid
>> data in checkpoint A has been overcovered in step 5), so we will encounter
>> inconsistent meta data, right?
>>
>> Thanks,
>>
>>>
>>> How about add an PRE2 status for prefree segments, and make
>>> sure the segments could be used safely to both checkpoints?
>>> Or any better solutions? Or this is not a problem?
>>>
>>> Look forward to your comments!
>>>
>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>> ---
>>>  fs/f2fs/gc.c      | 11 +++++++++--
>>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>  fs/f2fs/segment.h |  6 ++++++
>>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>> index 33e7969..153e3ea 100644
>>> --- a/fs/f2fs/gc.c
>>> +++ b/fs/f2fs/gc.c
>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>    * threshold, we can make them free by checkpoint. Then, we
>>>    * secure free segments which doesn't need fggc any more.
>>>    */
>>> - if (prefree_segments(sbi)) {
>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>> + ret = write_checkpoint(sbi, &cpc);
>>> + if (ret)
>>> + goto stop;
>>> + }
>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>   ret = write_checkpoint(sbi, &cpc);
>>>   if (ret)
>>>   goto stop;
>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>   goto gc_more;
>>>   }
>>>  
>>> - if (gc_type == FG_GC)
>>> + if (gc_type == FG_GC) {
>>> + ret = write_checkpoint(sbi, &cpc);
>>>   ret = write_checkpoint(sbi, &cpc);
>>> + }
>>>   }
>>>  stop:
>>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>> index 2e8e054d..9dec445 100644
>>> --- a/fs/f2fs/segment.c
>>> +++ b/fs/f2fs/segment.c
>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>   unsigned int segno;
>>>  
>>>   mutex_lock(&dirty_i->seglist_lock);
>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>   __set_test_and_free(sbi, segno);
>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>  }
>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   struct list_head *head = &dcc->entry_list;
>>>   struct discard_entry *entry, *this;
>>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>> + unsigned long *prefree_map;
>>>   unsigned int start = 0, end = -1;
>>>   unsigned int secno, start_segno;
>>>   bool force = (cpc->reason & CP_DISCARD);
>>> + int phase = 0;
>>> + enum dirty_type dirty_type = PRE2;
>>>  
>>>   mutex_lock(&dirty_i->seglist_lock);
>>>  
>>> +next_step:
>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>   while (1) {
>>>   int i;
>>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   for (i = start; i < end; i++)
>>>   clear_bit(i, prefree_map);
>>>  
>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>  
>>>   if (!test_opt(sbi, DISCARD))
>>>   continue;
>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   else
>>>   end = start - 1;
>>>   }
>>> + if (phase == 0) {
>>> + /* status change: PRE -> PRE2 */
>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>> + if (!test_and_set_bit(segno, prefree_map))
>>> + dirty_i->nr_dirty[PRE2]++;
>>> +
>>> + phase = 1;
>>> + dirty_type = PRE;
>>> + goto next_step;
>>> + }
>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>  
>>>   /* send small discards */
>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>  
>>>   mutex_lock(&dirty_i->seglist_lock);
>>>   __remove_dirty_segment(sbi, new_segno, PRE);
>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>  
>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>> index 71a2aaa..f702726 100644
>>> --- a/fs/f2fs/segment.h
>>> +++ b/fs/f2fs/segment.h
>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>   DIRTY, /* to count # of dirty segments */
>>>   PRE, /* to count # of entirely obsolete segments */
>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>   NR_DIRTY_TYPE
>>>  };
>>>  
>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>>  }
>>>  
>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>> +{
>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>> +}
>>> +
>>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>  {
>>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-22  2:40       ` guoweichao
  0 siblings, 0 replies; 20+ messages in thread
From: guoweichao @ 2018-01-22  2:40 UTC (permalink / raw)
  To: Chao Yu, jaegeuk; +Cc: linux-fsdevel, heyunlei, linux-f2fs-devel

Hi Chao,

On 2018/1/21 10:34, Chao Yu wrote:
> Hi Weichao,
> 
> On 2018/1/20 23:50, guoweichao wrote:
>> Hi Chao,
>>
>> Yes, it is exactly what I mean.
>> It seems that F2FS has no responsibility to cover hardware problems.
>> However, file systems usually choose redundancy for super block fault tolerance.
>> So I think we actually have considered some external errors when designing a file system.
>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
> 
> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
> which can keep always two checkpoint valid. There will be no simple modification
> to implement that one, in where we should cover not only prefree case but also
> SSR case.
> 
Nope, I do not intent to add another checkpoint. My main idea is reusing the invalid blocks
after two checkpoints are recorded them as invalid. We could mark or record the blocks that
are free to one checkpoint and valid to the other one as PARTIAL_FREE. And when set prefree
segments as free or use invalid blocks in SSR mode, just igore PARTIAL_FREE blocks. If there
is no other blocks can be used, just write a new checkpoint.

> IMO, the biggest problem there is available space, since in checkpoint C, we can
> only use invalid blocks in both checkpoint A and B, so in some cases there will
> almost be no valid space we can use during allocation, result in frequently
> checkpoint.
> 
> IMO, what we can do is trying to keep last valid checkpoint being integrity as
> possible as we can. One way is that we can add mirror or parity for the
> checkpoint which can help to do recovery once checkpoint is corrupted. At
> least, I hope that with it in debug version we can help hardware staff to fix
> their issue instead of wasting much time to troubleshoot filesystem issue.
> 
Yes, I agree. Adding a backup for the latest checkpoint in debug version is OK. Sometimes,
hardware issues are hard to uncover. We should try to separate them from F2FS.

Thanks,

> Thanks,
> 
>> with simple modification in current design and little overhead except for FG_GC.
>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>
>> Thanks,
>> *From:*Chao Yu
>> *To:*guoweichao,jaegeuk@kernel.org,
>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>> *Date:*2018-01-20 15:43:23
>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>
>> Hi Weichao,
>>
>> On 2018/1/20 7:29, Weichao Guo wrote:
>>> Currently, we set prefree segments as free ones after writing
>>> a checkpoint, then believe the segments could be used safely.
>>> However, if a sudden power-off coming and the newer checkpoint
>>> corrupted due to hardware issues at the same time, we will try
>>> to use the old checkpoint and may face an inconsistent file
>>> system status.
>>
>> IIUC, you mean:
>>
>> 1. write nodes into segment x;
>> 2. write checkpoint A;
>> 3. remove nodes in segment x;
>> 4. write checkpoint B;
>> 5. issue discard or write datas into segment x;
>> 6. sudden power-cut
>>
>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>> then start to use checkpoint A, but nodes in segment x recorded as valid
>> data in checkpoint A has been overcovered in step 5), so we will encounter
>> inconsistent meta data, right?
>>
>> Thanks,
>>
>>>
>>> How about add an PRE2 status for prefree segments, and make
>>> sure the segments could be used safely to both checkpoints?
>>> Or any better solutions? Or this is not a problem?
>>>
>>> Look forward to your comments!
>>>
>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>> ---
>>>  fs/f2fs/gc.c      | 11 +++++++++--
>>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>  fs/f2fs/segment.h |  6 ++++++
>>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>> index 33e7969..153e3ea 100644
>>> --- a/fs/f2fs/gc.c
>>> +++ b/fs/f2fs/gc.c
>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>    * threshold, we can make them free by checkpoint. Then, we
>>>    * secure free segments which doesn't need fggc any more.
>>>    */
>>> - if (prefree_segments(sbi)) {
>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>> + ret = write_checkpoint(sbi, &cpc);
>>> + if (ret)
>>> + goto stop;
>>> + }
>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>   ret = write_checkpoint(sbi, &cpc);
>>>   if (ret)
>>>   goto stop;
>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>   goto gc_more;
>>>   }
>>>  
>>> - if (gc_type == FG_GC)
>>> + if (gc_type == FG_GC) {
>>> + ret = write_checkpoint(sbi, &cpc);
>>>   ret = write_checkpoint(sbi, &cpc);
>>> + }
>>>   }
>>>  stop:
>>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>> index 2e8e054d..9dec445 100644
>>> --- a/fs/f2fs/segment.c
>>> +++ b/fs/f2fs/segment.c
>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>   unsigned int segno;
>>>  
>>>   mutex_lock(&dirty_i->seglist_lock);
>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>   __set_test_and_free(sbi, segno);
>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>  }
>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   struct list_head *head = &dcc->entry_list;
>>>   struct discard_entry *entry, *this;
>>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>> + unsigned long *prefree_map;
>>>   unsigned int start = 0, end = -1;
>>>   unsigned int secno, start_segno;
>>>   bool force = (cpc->reason & CP_DISCARD);
>>> + int phase = 0;
>>> + enum dirty_type dirty_type = PRE2;
>>>  
>>>   mutex_lock(&dirty_i->seglist_lock);
>>>  
>>> +next_step:
>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>   while (1) {
>>>   int i;
>>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   for (i = start; i < end; i++)
>>>   clear_bit(i, prefree_map);
>>>  
>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>  
>>>   if (!test_opt(sbi, DISCARD))
>>>   continue;
>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   else
>>>   end = start - 1;
>>>   }
>>> + if (phase == 0) {
>>> + /* status change: PRE -> PRE2 */
>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>> + if (!test_and_set_bit(segno, prefree_map))
>>> + dirty_i->nr_dirty[PRE2]++;
>>> +
>>> + phase = 1;
>>> + dirty_type = PRE;
>>> + goto next_step;
>>> + }
>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>  
>>>   /* send small discards */
>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>  
>>>   mutex_lock(&dirty_i->seglist_lock);
>>>   __remove_dirty_segment(sbi, new_segno, PRE);
>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>  
>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>> index 71a2aaa..f702726 100644
>>> --- a/fs/f2fs/segment.h
>>> +++ b/fs/f2fs/segment.h
>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>   DIRTY, /* to count # of dirty segments */
>>>   PRE, /* to count # of entirely obsolete segments */
>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>   NR_DIRTY_TYPE
>>>  };
>>>  
>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>>  }
>>>  
>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>> +{
>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>> +}
>>> +
>>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>  {
>>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>
> 
> .
> 


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-21  3:27     ` Gao Xiang via Linux-f2fs-devel
@ 2018-01-22  3:01       ` guoweichao
  2018-01-22  5:06         ` Gaoxiang (OS)
  0 siblings, 1 reply; 20+ messages in thread
From: guoweichao @ 2018-01-22  3:01 UTC (permalink / raw)
  To: Gao Xiang; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

Hi Xiang,

On 2018/1/21 11:27, Gao Xiang wrote:
> Hi Weichao,
> 
> 
> On 2018/1/21 0:02, guoweichao wrote:
>> Hi Xiang,
>>
>> it's not related to SPOR. Just consider the case given by Chao Yu.
>>
> 
> (It seems this email was not sent successfully, I resend it just for reference only)
> Oh I see, I have considered the scenario what Chao said before.
> 
> 1. write data into segment x;
> 2. write checkpoint A;
> 3. remove data in segment x;
> 4. write checkpoint B;
> 5. issue discard or write data into segment x;
> 6. sudden power-cut
> 
> Since f2fs is designed for double backup, 5) and 6), I think, actually belongs to checkpoint C.
> and when we are in checkpoint C, checkpoint A becomes unsafe because the latest checkpoint is B.
> and I think in that case we cannot prevent data writeback or something to pollute checkpoint A.
We do not have to prevent data writeback, we could just delay to reuse the invalid blocks which are
valid to the older checkpoint.
> 
> However, node segments (another metadata) would be special,
> but I have no idea whether introducing PRE2 would make all cases safe or not.
By adding a PRE2 status, we could set prefree node segments in two steps: PRE->PRE2->FREE, and therefore
make sure using free segments are safe to both chepoints. I forgot to consider SSR at first, but the main
idea is similar: avoiding to use invalid blocks that are not invalid to the older checkpoint, writing a
new checkpoint if there is no other invalid blocks.
> 
> In addition, if some data segments change between checkpoint A and C,
> some weird data that it didn't have (new data or data from other files) would be gotten when switching back to checkpoint A.
Data blocks damage do not affect file system consistency.

Anyway, this is a hardware related problem. We could just keep unchanged and try to exposure hardware issues.

Thanks,
> 
> 
> Thanks,
> 
>> Thanks,
>> *From:*Gao Xiang
>> *To:*guoweichao,
>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,heyunlei,
>> *Date:*2018-01-20 11:49:22
>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>
>> Hi Weichao,
>>
>>
>> On 2018/1/19 23:47, guoweichao wrote:
>> > A critical case is using the free segments as data segments which
>> > are previously node segments to the old checkpoint. With fault injecting
>> > of the newer CP pack, fsck can find errors when checking the sanity of nid.
>> Sorry to interrupt because I'm just curious about this scenario and the
>> detail.
>>
>> As far as I know, if the whole blocks in a segment become invalid,
>> the segment will become PREFREE, and then if a checkpoint is followed,
>> we can reuse this segment or
>> discard the whole segment safely after this checkpoint was done
>> (I think It makes sure that this segment is certainly FREE and not
>> reused in this checkpoint).
>>
>> If the segment in the old checkpoint is a node segment, and node blocks
>> in the segment are all invalid until the new checkpoint.
>> It seems no danger to reuse the FREE node segment as a data segment in
>> the next checkpoint?
>>
>> or something related to SPOR? In my mind f2fs-tools ignores POR node
>> chain...
>>
>> Thanks,
>> > On 2018/1/20 7:29, Weichao Guo wrote:
>> >> Currently, we set prefree segments as free ones after writing
>> >> a checkpoint, then believe the segments could be used safely.
>> >> However, if a sudden power-off coming and the newer checkpoint
>> >> corrupted due to hardware issues at the same time, we will try
>> >> to use the old checkpoint and may face an inconsistent file
>> >> system status.
>> >>
>> >> How about add an PRE2 status for prefree segments, and make
>> >> sure the segments could be used safely to both checkpoints?
>> >> Or any better solutions? Or this is not a problem?
>> >>
>> >> Look forward to your comments!
>> >>
>> >> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>> >> ---
>> >>   fs/f2fs/gc.c      | 11 +++++++++--
>> >>   fs/f2fs/segment.c | 21 ++++++++++++++++++---
>> >>   fs/f2fs/segment.h |  6 ++++++
>> >>   3 files changed, 33 insertions(+), 5 deletions(-)
>> >>
>> >> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> >> index 33e7969..153e3ea 100644
>> >> --- a/fs/f2fs/gc.c
>> >> +++ b/fs/f2fs/gc.c
>> >> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>> >>                * threshold, we can make them free by checkpoint. Then, we
>> >>                * secure free segments which doesn't need fggc any more.
>> >>                */
>> >> -            if (prefree_segments(sbi)) {
>> >> +            if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>> >> +                    ret = write_checkpoint(sbi, &cpc);
>> >> +                    if (ret)
>> >> +                            goto stop;
>> >> +            }
>> >> +            if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>> >>                       ret = write_checkpoint(sbi, &cpc);
>> >>                       if (ret)
>> >>                               goto stop;
>> >> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>> >>                       goto gc_more;
>> >>               }
>> >>
>> >> -            if (gc_type == FG_GC)
>> >> +            if (gc_type == FG_GC) {
>> >> +                    ret = write_checkpoint(sbi, &cpc);
>> >>                       ret = write_checkpoint(sbi, &cpc);
>> >> +            }
>> >>       }
>> >>   stop:
>> >>       SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>> >> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>> >> index 2e8e054d..9dec445 100644
>> >> --- a/fs/f2fs/segment.c
>> >> +++ b/fs/f2fs/segment.c
>> >> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>> >>       unsigned int segno;
>> >>
>> >>       mutex_lock(&dirty_i->seglist_lock);
>> >> -    for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> >> +    for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>> >>               __set_test_and_free(sbi, segno);
>> >>       mutex_unlock(&dirty_i->seglist_lock);
>> >>   }
>> >> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> >>       struct list_head *head = &dcc->entry_list;
>> >>       struct discard_entry *entry, *this;
>> >>       struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>> >> -    unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>> >> +    unsigned long *prefree_map;
>> >>       unsigned int start = 0, end = -1;
>> >>       unsigned int secno, start_segno;
>> >>       bool force = (cpc->reason & CP_DISCARD);
>> >> +    int phase = 0;
>> >> +    enum dirty_type dirty_type = PRE2;
>> >>
>> >>       mutex_lock(&dirty_i->seglist_lock);
>> >>
>> >> +next_step:
>> >> +    prefree_map = dirty_i->dirty_segmap[dirty_type];
>> >>       while (1) {
>> >>               int i;
>> >>               start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>> >> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> >>               for (i = start; i < end; i++)
>> >>                       clear_bit(i, prefree_map);
>> >>
>> >> -            dirty_i->nr_dirty[PRE] -= end - start;
>> >> +            dirty_i->nr_dirty[dirty_type] -= end - start;
>> >>
>> >>               if (!test_opt(sbi, DISCARD))
>> >>                       continue;
>> >> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> >>               else
>> >>                       end = start - 1;
>> >>       }
>> >> +    if (phase == 0) {
>> >> +            /* status change: PRE -> PRE2 */
>> >> +            for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>> >> +                    if (!test_and_set_bit(segno, prefree_map))
>> >> + dirty_i->nr_dirty[PRE2]++;
>> >> +
>> >> +            phase = 1;
>> >> +            dirty_type = PRE;
>> >> +            goto next_step;
>> >> +    }
>> >>       mutex_unlock(&dirty_i->seglist_lock);
>> >>
>> >>       /* send small discards */
>> >> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>> >>
>> >>       mutex_lock(&dirty_i->seglist_lock);
>> >>       __remove_dirty_segment(sbi, new_segno, PRE);
>> >> +    __remove_dirty_segment(sbi, new_segno, PRE2);
>> >>       __remove_dirty_segment(sbi, new_segno, DIRTY);
>> >>       mutex_unlock(&dirty_i->seglist_lock);
>> >>
>> >> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>> >> index 71a2aaa..f702726 100644
>> >> --- a/fs/f2fs/segment.h
>> >> +++ b/fs/f2fs/segment.h
>> >> @@ -263,6 +263,7 @@ enum dirty_type {
>> >>       DIRTY_COLD_NODE,        /* dirty segments assigned as cold node logs */
>> >>       DIRTY,                  /* to count # of dirty segments */
>> >>       PRE,                    /* to count # of entirely obsolete segments */
>> >> +    PRE2,                   /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>> >>       NR_DIRTY_TYPE
>> >>   };
>> >>
>> >> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>> >>       return DIRTY_I(sbi)->nr_dirty[PRE];
>> >>   }
>> >>
>> >> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>> >> +{
>> >> +    return DIRTY_I(sbi)->nr_dirty[PRE2];
>> >> +}
>> >> +
>> >>   static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>> >>   {
>> >>       return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>> >>
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-22  3:01       ` guoweichao
@ 2018-01-22  5:06         ` Gaoxiang (OS)
  0 siblings, 0 replies; 20+ messages in thread
From: Gaoxiang (OS) @ 2018-01-22  5:06 UTC (permalink / raw)
  To: guoweichao, Gao Xiang
  Cc: linux-f2fs-devel, linux-fsdevel, heyunlei, chao, Yuchao (T)

Hi Weichao,

On 2018/1/22 11:01, guoweichao wrote:
> Hi Xiang,
> 
> On 2018/1/21 11:27, Gao Xiang wrote:
>> Hi Weichao,
>>
>>
>> On 2018/1/21 0:02, guoweichao wrote:
>>> Hi Xiang,
>>>
>>> it's not related to SPOR. Just consider the case given by Chao Yu.
>>>
>>
>> (It seems this email was not sent successfully, I resend it just for reference only)
>> Oh I see, I have considered the scenario what Chao said before.
>>
>> 1. write data into segment x;
>> 2. write checkpoint A;
>> 3. remove data in segment x;
>> 4. write checkpoint B;
>> 5. issue discard or write data into segment x;
>> 6. sudden power-cut
>>
>> Since f2fs is designed for double backup, 5) and 6), I think, actually belongs to checkpoint C.
>> and when we are in checkpoint C, checkpoint A becomes unsafe because the latest checkpoint is B.
>> and I think in that case we cannot prevent data writeback or something to pollute checkpoint A.
> We do not have to prevent data writeback, we could just delay to reuse the invalid blocks which are
> valid to the older checkpoint.

Yes, your patch misses SSR reuse case, however, an even worse case is 
that when we are writing checkpoint C, partially flush METADATA(I mean 
SIT/NAT), but still not write the cp pack of checkpoint C, and suddenly 
power-off, timing as follows:

1. write checkpoint A
2. write checkpoint B
3. begin to write checkpoint C
4. successful to flush NAT/SIT entries
5. suddenly power-off
x 6. write checkpoint C cp pack x

We could not simply recover checkpoint A in the above case and
months ago I finally agreed all double-backup (or double buffer, whatever)
designs ensure *only one backup valid* --- no exception for f2fs.

By the way, It seems currently f2fs only pollutes(flushes) the SIT/NAT 
when writing checkpoint,
it has benefits -- *avoid the original checkpoint from being polluted as much as it can*,
but I also think that flushing SIT/NATs periodically rather 
than flushing them all in writing chechpoint could reduce the total time of 
writing checkpoint further.

As I said before, In my opinion, A better idea is to implement "snapshot",
1) when we snapshot the older checkpoint, we can re-check all metadata
of the older checkpoint and makes a more reliable and stable snapshot.
-- for example, we are in checkpoint C and snapshot checkpoint B, we
need re-check all metadata of checkpoint B when we make the snapshot.

2) we can control the size of snapshot (if the difference between the 
current checkpoint and the snapshot is larger than xxGB, we could discard
the oldest and build a new snapshot)
or build the snapshot periodically (per-hour, per-day, etc..).



Thanks,

>>
>> However, node segments (another metadata) would be special,
>> but I have no idea whether introducing PRE2 would make all cases safe or not.
> By adding a PRE2 status, we could set prefree node segments in two steps: PRE->PRE2->FREE, and therefore
> make sure using free segments are safe to both chepoints. I forgot to consider SSR at first, but the main
> idea is similar: avoiding to use invalid blocks that are not invalid to the older checkpoint, writing a
> new checkpoint if there is no other invalid blocks.
>>
>> In addition, if some data segments change between checkpoint A and C,
>> some weird data that it didn't have (new data or data from other files) would be gotten when switching back to checkpoint A.
> Data blocks damage do not affect file system consistency.
> 
> Anyway, this is a hardware related problem. We could just keep unchanged and try to exposure hardware issues.
> 
> Thanks,
>>
>>
>> Thanks,
>>
>>> Thanks,
>>> *From:*Gao Xiang
>>> *To:*guoweichao,
>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,heyunlei,
>>> *Date:*2018-01-20 11:49:22
>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>
>>> Hi Weichao,
>>>
>>>
>>> On 2018/1/19 23:47, guoweichao wrote:
>>>> A critical case is using the free segments as data segments which
>>>> are previously node segments to the old checkpoint. With fault injecting
>>>> of the newer CP pack, fsck can find errors when checking the sanity of nid.
>>> Sorry to interrupt because I'm just curious about this scenario and the
>>> detail.
>>>
>>> As far as I know, if the whole blocks in a segment become invalid,
>>> the segment will become PREFREE, and then if a checkpoint is followed,
>>> we can reuse this segment or
>>> discard the whole segment safely after this checkpoint was done
>>> (I think It makes sure that this segment is certainly FREE and not
>>> reused in this checkpoint).
>>>
>>> If the segment in the old checkpoint is a node segment, and node blocks
>>> in the segment are all invalid until the new checkpoint.
>>> It seems no danger to reuse the FREE node segment as a data segment in
>>> the next checkpoint?
>>>
>>> or something related to SPOR? In my mind f2fs-tools ignores POR node
>>> chain...
>>>
>>> Thanks,
>>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>>> Currently, we set prefree segments as free ones after writing
>>>>> a checkpoint, then believe the segments could be used safely.
>>>>> However, if a sudden power-off coming and the newer checkpoint
>>>>> corrupted due to hardware issues at the same time, we will try
>>>>> to use the old checkpoint and may face an inconsistent file
>>>>> system status.
>>>>>
>>>>> How about add an PRE2 status for prefree segments, and make
>>>>> sure the segments could be used safely to both checkpoints?
>>>>> Or any better solutions? Or this is not a problem?
>>>>>
>>>>> Look forward to your comments!
>>>>>
>>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>>> ---
>>>>>    fs/f2fs/gc.c      | 11 +++++++++--
>>>>>    fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>>    fs/f2fs/segment.h |  6 ++++++
>>>>>    3 files changed, 33 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>>> index 33e7969..153e3ea 100644
>>>>> --- a/fs/f2fs/gc.c
>>>>> +++ b/fs/f2fs/gc.c
>>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>                 * threshold, we can make them free by checkpoint. Then, we
>>>>>                 * secure free segments which doesn't need fggc any more.
>>>>>                 */
>>>>> -            if (prefree_segments(sbi)) {
>>>>> +            if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>>> +                    ret = write_checkpoint(sbi, &cpc);
>>>>> +                    if (ret)
>>>>> +                            goto stop;
>>>>> +            }
>>>>> +            if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>>                        ret = write_checkpoint(sbi, &cpc);
>>>>>                        if (ret)
>>>>>                                goto stop;
>>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>                        goto gc_more;
>>>>>                }
>>>>>
>>>>> -            if (gc_type == FG_GC)
>>>>> +            if (gc_type == FG_GC) {
>>>>> +                    ret = write_checkpoint(sbi, &cpc);
>>>>>                        ret = write_checkpoint(sbi, &cpc);
>>>>> +            }
>>>>>        }
>>>>>    stop:
>>>>>        SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>>> index 2e8e054d..9dec445 100644
>>>>> --- a/fs/f2fs/segment.c
>>>>> +++ b/fs/f2fs/segment.c
>>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>>        unsigned int segno;
>>>>>
>>>>>        mutex_lock(&dirty_i->seglist_lock);
>>>>> -    for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>> +    for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>>                __set_test_and_free(sbi, segno);
>>>>>        mutex_unlock(&dirty_i->seglist_lock);
>>>>>    }
>>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>        struct list_head *head = &dcc->entry_list;
>>>>>        struct discard_entry *entry, *this;
>>>>>        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>>> -    unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>>> +    unsigned long *prefree_map;
>>>>>        unsigned int start = 0, end = -1;
>>>>>        unsigned int secno, start_segno;
>>>>>        bool force = (cpc->reason & CP_DISCARD);
>>>>> +    int phase = 0;
>>>>> +    enum dirty_type dirty_type = PRE2;
>>>>>
>>>>>        mutex_lock(&dirty_i->seglist_lock);
>>>>>
>>>>> +next_step:
>>>>> +    prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>>        while (1) {
>>>>>                int i;
>>>>>                start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>                for (i = start; i < end; i++)
>>>>>                        clear_bit(i, prefree_map);
>>>>>
>>>>> -            dirty_i->nr_dirty[PRE] -= end - start;
>>>>> +            dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>>
>>>>>                if (!test_opt(sbi, DISCARD))
>>>>>                        continue;
>>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>                else
>>>>>                        end = start - 1;
>>>>>        }
>>>>> +    if (phase == 0) {
>>>>> +            /* status change: PRE -> PRE2 */
>>>>> +            for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>> +                    if (!test_and_set_bit(segno, prefree_map))
>>>>> + dirty_i->nr_dirty[PRE2]++;
>>>>> +
>>>>> +            phase = 1;
>>>>> +            dirty_type = PRE;
>>>>> +            goto next_step;
>>>>> +    }
>>>>>        mutex_unlock(&dirty_i->seglist_lock);
>>>>>
>>>>>        /* send small discards */
>>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>>
>>>>>        mutex_lock(&dirty_i->seglist_lock);
>>>>>        __remove_dirty_segment(sbi, new_segno, PRE);
>>>>> +    __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>>        __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>>        mutex_unlock(&dirty_i->seglist_lock);
>>>>>
>>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>>> index 71a2aaa..f702726 100644
>>>>> --- a/fs/f2fs/segment.h
>>>>> +++ b/fs/f2fs/segment.h
>>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>>        DIRTY_COLD_NODE,        /* dirty segments assigned as cold node logs */
>>>>>        DIRTY,                  /* to count # of dirty segments */
>>>>>        PRE,                    /* to count # of entirely obsolete segments */
>>>>> +    PRE2,                   /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>>        NR_DIRTY_TYPE
>>>>>    };
>>>>>
>>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>>        return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>>    }
>>>>>
>>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>>> +{
>>>>> +    return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>>> +}
>>>>> +
>>>>>    static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>>    {
>>>>>        return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>>
>>>
>>
>>
>> .
>>
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-22  2:40       ` guoweichao
  (?)
@ 2018-01-22  6:19       ` Chao Yu
  2018-01-22  8:03         ` guoweichao
  -1 siblings, 1 reply; 20+ messages in thread
From: Chao Yu @ 2018-01-22  6:19 UTC (permalink / raw)
  To: guoweichao, Chao Yu, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

On 2018/1/22 10:40, guoweichao wrote:
> Hi Chao,
> 
> On 2018/1/21 10:34, Chao Yu wrote:
>> Hi Weichao,
>>
>> On 2018/1/20 23:50, guoweichao wrote:
>>> Hi Chao,
>>>
>>> Yes, it is exactly what I mean.
>>> It seems that F2FS has no responsibility to cover hardware problems.
>>> However, file systems usually choose redundancy for super block fault tolerance.
>>> So I think we actually have considered some external errors when designing a file system.
>>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
>>
>> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
>> which can keep always two checkpoint valid. There will be no simple modification
>> to implement that one, in where we should cover not only prefree case but also
>> SSR case.
>>
> Nope, I do not intent to add another checkpoint. My main idea is reusing the invalid blocks
> after two checkpoints are recorded them as invalid. We could mark or record the blocks that
> are free to one checkpoint and valid to the other one as PARTIAL_FREE. And when set prefree
> segments as free or use invalid blocks in SSR mode, just igore PARTIAL_FREE blocks. If there
> is no other blocks can be used, just write a new checkpoint.

Actually, your implementation looks like a variant of A/B/C upgrade system
(or A/B upgrade system), as checkpoint C (or inflight A/B) uses main area
separated from checkpoint A/B, and its inflight dirty metadata will persist
into checkpoint A or B since we haven't actually allocated checkpoint C
area during mkfs.

Anyway, I don't think we have a strong reason to implement this one.

> 
>> IMO, the biggest problem there is available space, since in checkpoint C, we can
>> only use invalid blocks in both checkpoint A and B, so in some cases there will
>> almost be no valid space we can use during allocation, result in frequently
>> checkpoint.
>>
>> IMO, what we can do is trying to keep last valid checkpoint being integrity as
>> possible as we can. One way is that we can add mirror or parity for the
>> checkpoint which can help to do recovery once checkpoint is corrupted. At
>> least, I hope that with it in debug version we can help hardware staff to fix
>> their issue instead of wasting much time to troubleshoot filesystem issue.
>>
> Yes, I agree. Adding a backup for the latest checkpoint in debug version is OK. Sometimes,

Parity will be better due to light overhead?

Thanks,

> hardware issues are hard to uncover. We should try to separate them from F2FS.
> 
> Thanks,
> 
>> Thanks,
>>
>>> with simple modification in current design and little overhead except for FG_GC.
>>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>>
>>> Thanks,
>>> *From:*Chao Yu
>>> *To:*guoweichao,jaegeuk@kernel.org,
>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>>> *Date:*2018-01-20 15:43:23
>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>
>>> Hi Weichao,
>>>
>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>> Currently, we set prefree segments as free ones after writing
>>>> a checkpoint, then believe the segments could be used safely.
>>>> However, if a sudden power-off coming and the newer checkpoint
>>>> corrupted due to hardware issues at the same time, we will try
>>>> to use the old checkpoint and may face an inconsistent file
>>>> system status.
>>>
>>> IIUC, you mean:
>>>
>>> 1. write nodes into segment x;
>>> 2. write checkpoint A;
>>> 3. remove nodes in segment x;
>>> 4. write checkpoint B;
>>> 5. issue discard or write datas into segment x;
>>> 6. sudden power-cut
>>>
>>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>>> then start to use checkpoint A, but nodes in segment x recorded as valid
>>> data in checkpoint A has been overcovered in step 5), so we will encounter
>>> inconsistent meta data, right?
>>>
>>> Thanks,
>>>
>>>>
>>>> How about add an PRE2 status for prefree segments, and make
>>>> sure the segments could be used safely to both checkpoints?
>>>> Or any better solutions? Or this is not a problem?
>>>>
>>>> Look forward to your comments!
>>>>
>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>> ---
>>>>  fs/f2fs/gc.c      | 11 +++++++++--
>>>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>  fs/f2fs/segment.h |  6 ++++++
>>>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>> index 33e7969..153e3ea 100644
>>>> --- a/fs/f2fs/gc.c
>>>> +++ b/fs/f2fs/gc.c
>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>    * threshold, we can make them free by checkpoint. Then, we
>>>>    * secure free segments which doesn't need fggc any more.
>>>>    */
>>>> - if (prefree_segments(sbi)) {
>>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>> + ret = write_checkpoint(sbi, &cpc);
>>>> + if (ret)
>>>> + goto stop;
>>>> + }
>>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>   if (ret)
>>>>   goto stop;
>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>   goto gc_more;
>>>>   }
>>>>  
>>>> - if (gc_type == FG_GC)
>>>> + if (gc_type == FG_GC) {
>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>   ret = write_checkpoint(sbi, &cpc);
>>>> + }
>>>>   }
>>>>  stop:
>>>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>> index 2e8e054d..9dec445 100644
>>>> --- a/fs/f2fs/segment.c
>>>> +++ b/fs/f2fs/segment.c
>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>   unsigned int segno;
>>>>  
>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>   __set_test_and_free(sbi, segno);
>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>  }
>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>   struct list_head *head = &dcc->entry_list;
>>>>   struct discard_entry *entry, *this;
>>>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>> + unsigned long *prefree_map;
>>>>   unsigned int start = 0, end = -1;
>>>>   unsigned int secno, start_segno;
>>>>   bool force = (cpc->reason & CP_DISCARD);
>>>> + int phase = 0;
>>>> + enum dirty_type dirty_type = PRE2;
>>>>  
>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>  
>>>> +next_step:
>>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>   while (1) {
>>>>   int i;
>>>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>   for (i = start; i < end; i++)
>>>>   clear_bit(i, prefree_map);
>>>>  
>>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>  
>>>>   if (!test_opt(sbi, DISCARD))
>>>>   continue;
>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>   else
>>>>   end = start - 1;
>>>>   }
>>>> + if (phase == 0) {
>>>> + /* status change: PRE -> PRE2 */
>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>> + if (!test_and_set_bit(segno, prefree_map))
>>>> + dirty_i->nr_dirty[PRE2]++;
>>>> +
>>>> + phase = 1;
>>>> + dirty_type = PRE;
>>>> + goto next_step;
>>>> + }
>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>  
>>>>   /* send small discards */
>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>  
>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>   __remove_dirty_segment(sbi, new_segno, PRE);
>>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>  
>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>> index 71a2aaa..f702726 100644
>>>> --- a/fs/f2fs/segment.h
>>>> +++ b/fs/f2fs/segment.h
>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>>   DIRTY, /* to count # of dirty segments */
>>>>   PRE, /* to count # of entirely obsolete segments */
>>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>   NR_DIRTY_TYPE
>>>>  };
>>>>  
>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>  }
>>>>  
>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>> +{
>>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>> +}
>>>> +
>>>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>  {
>>>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>
>>
>> .
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-22  6:19       ` Chao Yu
@ 2018-01-22  8:03         ` guoweichao
  2018-01-22  8:27             ` Chao Yu
  0 siblings, 1 reply; 20+ messages in thread
From: guoweichao @ 2018-01-22  8:03 UTC (permalink / raw)
  To: Chao Yu, Chao Yu, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei



On 2018/1/22 14:19, Chao Yu wrote:
> On 2018/1/22 10:40, guoweichao wrote:
>> Hi Chao,
>>
>> On 2018/1/21 10:34, Chao Yu wrote:
>>> Hi Weichao,
>>>
>>> On 2018/1/20 23:50, guoweichao wrote:
>>>> Hi Chao,
>>>>
>>>> Yes, it is exactly what I mean.
>>>> It seems that F2FS has no responsibility to cover hardware problems.
>>>> However, file systems usually choose redundancy for super block fault tolerance.
>>>> So I think we actually have considered some external errors when designing a file system.
>>>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>>>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>>>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
>>>
>>> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
>>> which can keep always two checkpoint valid. There will be no simple modification
>>> to implement that one, in where we should cover not only prefree case but also
>>> SSR case.
>>>
>> Nope, I do not intent to add another checkpoint. My main idea is reusing the invalid blocks
>> after two checkpoints are recorded them as invalid. We could mark or record the blocks that
>> are free to one checkpoint and valid to the other one as PARTIAL_FREE. And when set prefree
>> segments as free or use invalid blocks in SSR mode, just igore PARTIAL_FREE blocks. If there
>> is no other blocks can be used, just write a new checkpoint.
> 
> Actually, your implementation looks like a variant of A/B/C upgrade system
> (or A/B upgrade system), as checkpoint C (or inflight A/B) uses main area
> separated from checkpoint A/B, and its inflight dirty metadata will persist
> into checkpoint A or B since we haven't actually allocated checkpoint C
> area during mkfs.
> 
> Anyway, I don't think we have a strong reason to implement this one.
> 
>>
>>> IMO, the biggest problem there is available space, since in checkpoint C, we can
>>> only use invalid blocks in both checkpoint A and B, so in some cases there will
>>> almost be no valid space we can use during allocation, result in frequently
>>> checkpoint.
>>>
>>> IMO, what we can do is trying to keep last valid checkpoint being integrity as
>>> possible as we can. One way is that we can add mirror or parity for the
>>> checkpoint which can help to do recovery once checkpoint is corrupted. At
>>> least, I hope that with it in debug version we can help hardware staff to fix
>>> their issue instead of wasting much time to troubleshoot filesystem issue.
>>>
>> Yes, I agree. Adding a backup for the latest checkpoint in debug version is OK. Sometimes,
> 
> Parity will be better due to light overhead?
Yes, backup may also cause disk layout changes. How about adding a parity block for the blocks
in a checkpoint similar to RAID-5 or other erasure code methods?

Thanks,

> 
> Thanks,
> 
>> hardware issues are hard to uncover. We should try to separate them from F2FS.
>>
>> Thanks,
>>
>>> Thanks,
>>>
>>>> with simple modification in current design and little overhead except for FG_GC.
>>>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>>>
>>>> Thanks,
>>>> *From:*Chao Yu
>>>> *To:*guoweichao,jaegeuk@kernel.org,
>>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>>>> *Date:*2018-01-20 15:43:23
>>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>>
>>>> Hi Weichao,
>>>>
>>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>>> Currently, we set prefree segments as free ones after writing
>>>>> a checkpoint, then believe the segments could be used safely.
>>>>> However, if a sudden power-off coming and the newer checkpoint
>>>>> corrupted due to hardware issues at the same time, we will try
>>>>> to use the old checkpoint and may face an inconsistent file
>>>>> system status.
>>>>
>>>> IIUC, you mean:
>>>>
>>>> 1. write nodes into segment x;
>>>> 2. write checkpoint A;
>>>> 3. remove nodes in segment x;
>>>> 4. write checkpoint B;
>>>> 5. issue discard or write datas into segment x;
>>>> 6. sudden power-cut
>>>>
>>>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>>>> then start to use checkpoint A, but nodes in segment x recorded as valid
>>>> data in checkpoint A has been overcovered in step 5), so we will encounter
>>>> inconsistent meta data, right?
>>>>
>>>> Thanks,
>>>>
>>>>>
>>>>> How about add an PRE2 status for prefree segments, and make
>>>>> sure the segments could be used safely to both checkpoints?
>>>>> Or any better solutions? Or this is not a problem?
>>>>>
>>>>> Look forward to your comments!
>>>>>
>>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>>> ---
>>>>>  fs/f2fs/gc.c      | 11 +++++++++--
>>>>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>>  fs/f2fs/segment.h |  6 ++++++
>>>>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>>> index 33e7969..153e3ea 100644
>>>>> --- a/fs/f2fs/gc.c
>>>>> +++ b/fs/f2fs/gc.c
>>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>    * threshold, we can make them free by checkpoint. Then, we
>>>>>    * secure free segments which doesn't need fggc any more.
>>>>>    */
>>>>> - if (prefree_segments(sbi)) {
>>>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>> + if (ret)
>>>>> + goto stop;
>>>>> + }
>>>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>>   if (ret)
>>>>>   goto stop;
>>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>   goto gc_more;
>>>>>   }
>>>>>  
>>>>> - if (gc_type == FG_GC)
>>>>> + if (gc_type == FG_GC) {
>>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>> + }
>>>>>   }
>>>>>  stop:
>>>>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>>> index 2e8e054d..9dec445 100644
>>>>> --- a/fs/f2fs/segment.c
>>>>> +++ b/fs/f2fs/segment.c
>>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>>   unsigned int segno;
>>>>>  
>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>>   __set_test_and_free(sbi, segno);
>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>  }
>>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>   struct list_head *head = &dcc->entry_list;
>>>>>   struct discard_entry *entry, *this;
>>>>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>>> + unsigned long *prefree_map;
>>>>>   unsigned int start = 0, end = -1;
>>>>>   unsigned int secno, start_segno;
>>>>>   bool force = (cpc->reason & CP_DISCARD);
>>>>> + int phase = 0;
>>>>> + enum dirty_type dirty_type = PRE2;
>>>>>  
>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>  
>>>>> +next_step:
>>>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>>   while (1) {
>>>>>   int i;
>>>>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>   for (i = start; i < end; i++)
>>>>>   clear_bit(i, prefree_map);
>>>>>  
>>>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>>  
>>>>>   if (!test_opt(sbi, DISCARD))
>>>>>   continue;
>>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>   else
>>>>>   end = start - 1;
>>>>>   }
>>>>> + if (phase == 0) {
>>>>> + /* status change: PRE -> PRE2 */
>>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>> + if (!test_and_set_bit(segno, prefree_map))
>>>>> + dirty_i->nr_dirty[PRE2]++;
>>>>> +
>>>>> + phase = 1;
>>>>> + dirty_type = PRE;
>>>>> + goto next_step;
>>>>> + }
>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>  
>>>>>   /* send small discards */
>>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>>  
>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>   __remove_dirty_segment(sbi, new_segno, PRE);
>>>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>  
>>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>>> index 71a2aaa..f702726 100644
>>>>> --- a/fs/f2fs/segment.h
>>>>> +++ b/fs/f2fs/segment.h
>>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>>>   DIRTY, /* to count # of dirty segments */
>>>>>   PRE, /* to count # of entirely obsolete segments */
>>>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>>   NR_DIRTY_TYPE
>>>>>  };
>>>>>  
>>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>>  }
>>>>>  
>>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>>> +{
>>>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>>> +}
>>>>> +
>>>>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>>  {
>>>>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>>
>>>
>>> .
>>>
>>
>>
>> .
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-21  3:12     ` Gao Xiang via Linux-f2fs-devel
@ 2018-01-22  8:25         ` Chao Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2018-01-22  8:25 UTC (permalink / raw)
  To: Gao Xiang, Chao Yu, guoweichao; +Cc: linux-fsdevel, heyunlei, linux-f2fs-devel

Hi Xiang,

On 2018/1/21 11:12, Gao Xiang via Linux-f2fs-devel wrote:
> Hi Chao and Weichao,
> 
> 
> On 2018/1/21 10:34, Chao Yu wrote:
>> Hi Weichao,
>>
>> On 2018/1/20 23:50, guoweichao wrote:
>>> Hi Chao,
>>>
>>> Yes, it is exactly what I mean.
>>> It seems that F2FS has no responsibility to cover hardware problems.
>>> However, file systems usually choose redundancy for super block fault tolerance.
>>> So I think we actually have considered some external errors when designing a file system.
>>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
>> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
>> which can keep always two checkpoint valid. There will be no simple modification
>> to implement that one, in where we should cover not only prefree case but also
>> SSR case.
> *nod*, triple-like-backups would not solve all hardware issues in a even 
> worse case,
> and in order to make all backups available we need to find more extra 
> area to leave all modification among 3 checkpoints.
> 
> In my opinion, introducing "snapshot" feature could be something more 
> useful for phone (yet it is a big feature :( )

Yes, I think we could use that way, but now, I guess Weichao wants a quick
and simple solution in production, snapshot will cause heavy overhead in
current scenario, if there are other usage scenario in userspace for
snapshot, we can add it more reasonably, right? ;)

Thanks,

> and if fsck found something is unrecoverable
> we could roll back to a stable and reliable (via re-checking metadata) 
> snapshot (maybe hours ago or days ago,
> and remind users when rollback in fsck or somewhere).
> 
> Sorry to bother all,
> 
> Thanks,
>>
>> IMO, the biggest problem there is available space, since in checkpoint C, we can
>> only use invalid blocks in both checkpoint A and B, so in some cases there will
>> almost be no valid space we can use during allocation, result in frequently
>> checkpoint.
>>
>> IMO, what we can do is trying to keep last valid checkpoint being integrity as
>> possible as we can. One way is that we can add mirror or parity for the
>> checkpoint which can help to do recovery once checkpoint is corrupted. At
>> least, I hope that with it in debug version we can help hardware staff to fix
>> their issue instead of wasting much time to troubleshoot filesystem issue.
>>
>> Thanks,
>>
>>> with simple modification in current design and little overhead except for FG_GC.
>>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>>
>>> Thanks,
>>> *From:*Chao Yu
>>> *To:*guoweichao,jaegeuk@kernel.org,
>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>>> *Date:*2018-01-20 15:43:23
>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>
>>> Hi Weichao,
>>>
>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>> Currently, we set prefree segments as free ones after writing
>>>> a checkpoint, then believe the segments could be used safely.
>>>> However, if a sudden power-off coming and the newer checkpoint
>>>> corrupted due to hardware issues at the same time, we will try
>>>> to use the old checkpoint and may face an inconsistent file
>>>> system status.
>>> IIUC, you mean:
>>>
>>> 1. write nodes into segment x;
>>> 2. write checkpoint A;
>>> 3. remove nodes in segment x;
>>> 4. write checkpoint B;
>>> 5. issue discard or write datas into segment x;
>>> 6. sudden power-cut
>>>
>>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>>> then start to use checkpoint A, but nodes in segment x recorded as valid
>>> data in checkpoint A has been overcovered in step 5), so we will encounter
>>> inconsistent meta data, right?
>>>
>>> Thanks,
>>>
>>>> How about add an PRE2 status for prefree segments, and make
>>>> sure the segments could be used safely to both checkpoints?
>>>> Or any better solutions? Or this is not a problem?
>>>>
>>>> Look forward to your comments!
>>>>
>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>> ---
>>>>  �fs/f2fs/gc.c �����| 11 +++++++++--
>>>>  �fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>  �fs/f2fs/segment.h | �6 ++++++
>>>>  �3 files changed, 33 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>> index 33e7969..153e3ea 100644
>>>> --- a/fs/f2fs/gc.c
>>>> +++ b/fs/f2fs/gc.c
>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>  � �* threshold, we can make them free by checkpoint. Then, we
>>>>  � �* secure free segments which doesn't need fggc any more.
>>>>  � �*/
>>>> - if (prefree_segments(sbi)) {
>>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>> + ret = write_checkpoint(sbi, &cpc);
>>>> + if (ret)
>>>> + goto stop;
>>>> + }
>>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>  � ret = write_checkpoint(sbi, &cpc);
>>>>  � if (ret)
>>>>  � goto stop;
>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>  � goto gc_more;
>>>>  � }
>>>>   
>>>> - if (gc_type == FG_GC)
>>>> + if (gc_type == FG_GC) {
>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>  � ret = write_checkpoint(sbi, &cpc);
>>>> + }
>>>>  � }
>>>>  �stop:
>>>>  � SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>> index 2e8e054d..9dec445 100644
>>>> --- a/fs/f2fs/segment.c
>>>> +++ b/fs/f2fs/segment.c
>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>  � unsigned int segno;
>>>>   
>>>>  � mutex_lock(&dirty_i->seglist_lock);
>>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>  � __set_test_and_free(sbi, segno);
>>>>  � mutex_unlock(&dirty_i->seglist_lock);
>>>>  �}
>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>  � struct list_head *head = &dcc->entry_list;
>>>>  � struct discard_entry *entry, *this;
>>>>  � struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>> + unsigned long *prefree_map;
>>>>  � unsigned int start = 0, end = -1;
>>>>  � unsigned int secno, start_segno;
>>>>  � bool force = (cpc->reason & CP_DISCARD);
>>>> + int phase = 0;
>>>> + enum dirty_type dirty_type = PRE2;
>>>>   
>>>>  � mutex_lock(&dirty_i->seglist_lock);
>>>>   
>>>> +next_step:
>>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>  � while (1) {
>>>>  � int i;
>>>>  � start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>  � for (i = start; i < end; i++)
>>>>  � clear_bit(i, prefree_map);
>>>>   
>>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>   
>>>>  � if (!test_opt(sbi, DISCARD))
>>>>  � continue;
>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>  � else
>>>>  � end = start - 1;
>>>>  � }
>>>> + if (phase == 0) {
>>>> + /* status change: PRE -> PRE2 */
>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>> + if (!test_and_set_bit(segno, prefree_map))
>>>> + dirty_i->nr_dirty[PRE2]++;
>>>> +
>>>> + phase = 1;
>>>> + dirty_type = PRE;
>>>> + goto next_step;
>>>> + }
>>>>  � mutex_unlock(&dirty_i->seglist_lock);
>>>>   
>>>>  � /* send small discards */
>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>   
>>>>  � mutex_lock(&dirty_i->seglist_lock);
>>>>  � __remove_dirty_segment(sbi, new_segno, PRE);
>>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>  � __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>  � mutex_unlock(&dirty_i->seglist_lock);
>>>>   
>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>> index 71a2aaa..f702726 100644
>>>> --- a/fs/f2fs/segment.h
>>>> +++ b/fs/f2fs/segment.h
>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>  � DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>>  � DIRTY, /* to count # of dirty segments */
>>>>  � PRE, /* to count # of entirely obsolete segments */
>>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>  � NR_DIRTY_TYPE
>>>>  �};
>>>>   
>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>  � return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>  �}
>>>>   
>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>> +{
>>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>> +}
>>>> +
>>>>  �static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>  �{
>>>>  � return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>
>> ------------------------------------------------------------------------------
>> Check out the vibrant tech community on one of the world's most
>> engaging tech sites, Slashdot.org! http://sdm.link/slashdot
>> _______________________________________________
>> Linux-f2fs-devel mailing list
>> Linux-f2fs-devel@lists.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 
> 
> ------------------------------------------------------------------------------
> Check out the vibrant tech community on one of the world's most
> engaging tech sites, Slashdot.org! http://sdm.link/slashdot
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-22  8:25         ` Chao Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2018-01-22  8:25 UTC (permalink / raw)
  To: Gao Xiang, Chao Yu, guoweichao; +Cc: linux-fsdevel, heyunlei, linux-f2fs-devel

Hi Xiang,

On 2018/1/21 11:12, Gao Xiang via Linux-f2fs-devel wrote:
> Hi Chao and Weichao,
> 
> 
> On 2018/1/21 10:34, Chao Yu wrote:
>> Hi Weichao,
>>
>> On 2018/1/20 23:50, guoweichao wrote:
>>> Hi Chao,
>>>
>>> Yes, it is exactly what I mean.
>>> It seems that F2FS has no responsibility to cover hardware problems.
>>> However, file systems usually choose redundancy for super block fault tolerance.
>>> So I think we actually have considered some external errors when designing a file system.
>>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
>> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
>> which can keep always two checkpoint valid. There will be no simple modification
>> to implement that one, in where we should cover not only prefree case but also
>> SSR case.
> *nod*, triple-like-backups would not solve all hardware issues in a even 
> worse case,
> and in order to make all backups available we need to find more extra 
> area to leave all modification among 3 checkpoints.
> 
> In my opinion, introducing "snapshot" feature could be something more 
> useful for phone (yet it is a big feature :( )

Yes, I think we could use that way, but now, I guess Weichao wants a quick
and simple solution in production, snapshot will cause heavy overhead in
current scenario, if there are other usage scenario in userspace for
snapshot, we can add it more reasonably, right? ;)

Thanks,

> and if fsck found something is unrecoverable
> we could roll back to a stable and reliable (via re-checking metadata) 
> snapshot (maybe hours ago or days ago,
> and remind users when rollback in fsck or somewhere).
> 
> Sorry to bother all,
> 
> Thanks,
>>
>> IMO, the biggest problem there is available space, since in checkpoint C, we can
>> only use invalid blocks in both checkpoint A and B, so in some cases there will
>> almost be no valid space we can use during allocation, result in frequently
>> checkpoint.
>>
>> IMO, what we can do is trying to keep last valid checkpoint being integrity as
>> possible as we can. One way is that we can add mirror or parity for the
>> checkpoint which can help to do recovery once checkpoint is corrupted. At
>> least, I hope that with it in debug version we can help hardware staff to fix
>> their issue instead of wasting much time to troubleshoot filesystem issue.
>>
>> Thanks,
>>
>>> with simple modification in current design and little overhead except for FG_GC.
>>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>>
>>> Thanks,
>>> *From:*Chao Yu
>>> *To:*guoweichao,jaegeuk@kernel.org,
>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>>> *Date:*2018-01-20 15:43:23
>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>
>>> Hi Weichao,
>>>
>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>> Currently, we set prefree segments as free ones after writing
>>>> a checkpoint, then believe the segments could be used safely.
>>>> However, if a sudden power-off coming and the newer checkpoint
>>>> corrupted due to hardware issues at the same time, we will try
>>>> to use the old checkpoint and may face an inconsistent file
>>>> system status.
>>> IIUC, you mean:
>>>
>>> 1. write nodes into segment x;
>>> 2. write checkpoint A;
>>> 3. remove nodes in segment x;
>>> 4. write checkpoint B;
>>> 5. issue discard or write datas into segment x;
>>> 6. sudden power-cut
>>>
>>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>>> then start to use checkpoint A, but nodes in segment x recorded as valid
>>> data in checkpoint A has been overcovered in step 5), so we will encounter
>>> inconsistent meta data, right?
>>>
>>> Thanks,
>>>
>>>> How about add an PRE2 status for prefree segments, and make
>>>> sure the segments could be used safely to both checkpoints?
>>>> Or any better solutions? Or this is not a problem?
>>>>
>>>> Look forward to your comments!
>>>>
>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>> ---
>>>>   fs/f2fs/gc.c      | 11 +++++++++--
>>>>   fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>   fs/f2fs/segment.h |  6 ++++++
>>>>   3 files changed, 33 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>> index 33e7969..153e3ea 100644
>>>> --- a/fs/f2fs/gc.c
>>>> +++ b/fs/f2fs/gc.c
>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>     * threshold, we can make them free by checkpoint. Then, we
>>>>     * secure free segments which doesn't need fggc any more.
>>>>     */
>>>> - if (prefree_segments(sbi)) {
>>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>> + ret = write_checkpoint(sbi, &cpc);
>>>> + if (ret)
>>>> + goto stop;
>>>> + }
>>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>    ret = write_checkpoint(sbi, &cpc);
>>>>    if (ret)
>>>>    goto stop;
>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>    goto gc_more;
>>>>    }
>>>>   
>>>> - if (gc_type == FG_GC)
>>>> + if (gc_type == FG_GC) {
>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>    ret = write_checkpoint(sbi, &cpc);
>>>> + }
>>>>    }
>>>>   stop:
>>>>    SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>> index 2e8e054d..9dec445 100644
>>>> --- a/fs/f2fs/segment.c
>>>> +++ b/fs/f2fs/segment.c
>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>    unsigned int segno;
>>>>   
>>>>    mutex_lock(&dirty_i->seglist_lock);
>>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>    __set_test_and_free(sbi, segno);
>>>>    mutex_unlock(&dirty_i->seglist_lock);
>>>>   }
>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>    struct list_head *head = &dcc->entry_list;
>>>>    struct discard_entry *entry, *this;
>>>>    struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>> + unsigned long *prefree_map;
>>>>    unsigned int start = 0, end = -1;
>>>>    unsigned int secno, start_segno;
>>>>    bool force = (cpc->reason & CP_DISCARD);
>>>> + int phase = 0;
>>>> + enum dirty_type dirty_type = PRE2;
>>>>   
>>>>    mutex_lock(&dirty_i->seglist_lock);
>>>>   
>>>> +next_step:
>>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>    while (1) {
>>>>    int i;
>>>>    start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>    for (i = start; i < end; i++)
>>>>    clear_bit(i, prefree_map);
>>>>   
>>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>   
>>>>    if (!test_opt(sbi, DISCARD))
>>>>    continue;
>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>    else
>>>>    end = start - 1;
>>>>    }
>>>> + if (phase == 0) {
>>>> + /* status change: PRE -> PRE2 */
>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>> + if (!test_and_set_bit(segno, prefree_map))
>>>> + dirty_i->nr_dirty[PRE2]++;
>>>> +
>>>> + phase = 1;
>>>> + dirty_type = PRE;
>>>> + goto next_step;
>>>> + }
>>>>    mutex_unlock(&dirty_i->seglist_lock);
>>>>   
>>>>    /* send small discards */
>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>   
>>>>    mutex_lock(&dirty_i->seglist_lock);
>>>>    __remove_dirty_segment(sbi, new_segno, PRE);
>>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>    __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>    mutex_unlock(&dirty_i->seglist_lock);
>>>>   
>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>> index 71a2aaa..f702726 100644
>>>> --- a/fs/f2fs/segment.h
>>>> +++ b/fs/f2fs/segment.h
>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>    DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>>    DIRTY, /* to count # of dirty segments */
>>>>    PRE, /* to count # of entirely obsolete segments */
>>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>    NR_DIRTY_TYPE
>>>>   };
>>>>   
>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>    return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>   }
>>>>   
>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>> +{
>>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>> +}
>>>> +
>>>>   static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>   {
>>>>    return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>
>> ------------------------------------------------------------------------------
>> Check out the vibrant tech community on one of the world's most
>> engaging tech sites, Slashdot.org! http://sdm.link/slashdot
>> _______________________________________________
>> Linux-f2fs-devel mailing list
>> Linux-f2fs-devel@lists.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 
> 
> ------------------------------------------------------------------------------
> Check out the vibrant tech community on one of the world's most
> engaging tech sites, Slashdot.org! http://sdm.link/slashdot
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
  2018-01-22  8:03         ` guoweichao
@ 2018-01-22  8:27             ` Chao Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2018-01-22  8:27 UTC (permalink / raw)
  To: guoweichao, Chao Yu, jaegeuk; +Cc: linux-f2fs-devel, linux-fsdevel, heyunlei

On 2018/1/22 16:03, guoweichao wrote:
> 
> 
> On 2018/1/22 14:19, Chao Yu wrote:
>> On 2018/1/22 10:40, guoweichao wrote:
>>> Hi Chao,
>>>
>>> On 2018/1/21 10:34, Chao Yu wrote:
>>>> Hi Weichao,
>>>>
>>>> On 2018/1/20 23:50, guoweichao wrote:
>>>>> Hi Chao,
>>>>>
>>>>> Yes, it is exactly what I mean.
>>>>> It seems that F2FS has no responsibility to cover hardware problems.
>>>>> However, file systems usually choose redundancy for super block fault tolerance.
>>>>> So I think we actually have considered some external errors when designing a file system.
>>>>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>>>>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>>>>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
>>>>
>>>> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
>>>> which can keep always two checkpoint valid. There will be no simple modification
>>>> to implement that one, in where we should cover not only prefree case but also
>>>> SSR case.
>>>>
>>> Nope, I do not intent to add another checkpoint. My main idea is reusing the invalid blocks
>>> after two checkpoints are recorded them as invalid. We could mark or record the blocks that
>>> are free to one checkpoint and valid to the other one as PARTIAL_FREE. And when set prefree
>>> segments as free or use invalid blocks in SSR mode, just igore PARTIAL_FREE blocks. If there
>>> is no other blocks can be used, just write a new checkpoint.
>>
>> Actually, your implementation looks like a variant of A/B/C upgrade system
>> (or A/B upgrade system), as checkpoint C (or inflight A/B) uses main area
>> separated from checkpoint A/B, and its inflight dirty metadata will persist
>> into checkpoint A or B since we haven't actually allocated checkpoint C
>> area during mkfs.
>>
>> Anyway, I don't think we have a strong reason to implement this one.
>>
>>>
>>>> IMO, the biggest problem there is available space, since in checkpoint C, we can
>>>> only use invalid blocks in both checkpoint A and B, so in some cases there will
>>>> almost be no valid space we can use during allocation, result in frequently
>>>> checkpoint.
>>>>
>>>> IMO, what we can do is trying to keep last valid checkpoint being integrity as
>>>> possible as we can. One way is that we can add mirror or parity for the
>>>> checkpoint which can help to do recovery once checkpoint is corrupted. At
>>>> least, I hope that with it in debug version we can help hardware staff to fix
>>>> their issue instead of wasting much time to troubleshoot filesystem issue.
>>>>
>>> Yes, I agree. Adding a backup for the latest checkpoint in debug version is OK. Sometimes,
>>
>> Parity will be better due to light overhead?
> Yes, backup may also cause disk layout changes. How about adding a parity block for the blocks

Right,

> in a checkpoint similar to RAID-5 or other erasure code methods?

Yes, I think we can.

Thanks,

> 
> Thanks,
> 
>>
>> Thanks,
>>
>>> hardware issues are hard to uncover. We should try to separate them from F2FS.
>>>
>>> Thanks,
>>>
>>>> Thanks,
>>>>
>>>>> with simple modification in current design and little overhead except for FG_GC.
>>>>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>>>>
>>>>> Thanks,
>>>>> *From:*Chao Yu
>>>>> *To:*guoweichao,jaegeuk@kernel.org,
>>>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>>>>> *Date:*2018-01-20 15:43:23
>>>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>>>
>>>>> Hi Weichao,
>>>>>
>>>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>>>> Currently, we set prefree segments as free ones after writing
>>>>>> a checkpoint, then believe the segments could be used safely.
>>>>>> However, if a sudden power-off coming and the newer checkpoint
>>>>>> corrupted due to hardware issues at the same time, we will try
>>>>>> to use the old checkpoint and may face an inconsistent file
>>>>>> system status.
>>>>>
>>>>> IIUC, you mean:
>>>>>
>>>>> 1. write nodes into segment x;
>>>>> 2. write checkpoint A;
>>>>> 3. remove nodes in segment x;
>>>>> 4. write checkpoint B;
>>>>> 5. issue discard or write datas into segment x;
>>>>> 6. sudden power-cut
>>>>>
>>>>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>>>>> then start to use checkpoint A, but nodes in segment x recorded as valid
>>>>> data in checkpoint A has been overcovered in step 5), so we will encounter
>>>>> inconsistent meta data, right?
>>>>>
>>>>> Thanks,
>>>>>
>>>>>>
>>>>>> How about add an PRE2 status for prefree segments, and make
>>>>>> sure the segments could be used safely to both checkpoints?
>>>>>> Or any better solutions? Or this is not a problem?
>>>>>>
>>>>>> Look forward to your comments!
>>>>>>
>>>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>>>> ---
>>>>>>  fs/f2fs/gc.c      | 11 +++++++++--
>>>>>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>>>  fs/f2fs/segment.h |  6 ++++++
>>>>>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>>>> index 33e7969..153e3ea 100644
>>>>>> --- a/fs/f2fs/gc.c
>>>>>> +++ b/fs/f2fs/gc.c
>>>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>>    * threshold, we can make them free by checkpoint. Then, we
>>>>>>    * secure free segments which doesn't need fggc any more.
>>>>>>    */
>>>>>> - if (prefree_segments(sbi)) {
>>>>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>>> + if (ret)
>>>>>> + goto stop;
>>>>>> + }
>>>>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>>>   if (ret)
>>>>>>   goto stop;
>>>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>>   goto gc_more;
>>>>>>   }
>>>>>>  
>>>>>> - if (gc_type == FG_GC)
>>>>>> + if (gc_type == FG_GC) {
>>>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>>> + }
>>>>>>   }
>>>>>>  stop:
>>>>>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>>>> index 2e8e054d..9dec445 100644
>>>>>> --- a/fs/f2fs/segment.c
>>>>>> +++ b/fs/f2fs/segment.c
>>>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>>>   unsigned int segno;
>>>>>>  
>>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>>>   __set_test_and_free(sbi, segno);
>>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>>  }
>>>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>>   struct list_head *head = &dcc->entry_list;
>>>>>>   struct discard_entry *entry, *this;
>>>>>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>>>> + unsigned long *prefree_map;
>>>>>>   unsigned int start = 0, end = -1;
>>>>>>   unsigned int secno, start_segno;
>>>>>>   bool force = (cpc->reason & CP_DISCARD);
>>>>>> + int phase = 0;
>>>>>> + enum dirty_type dirty_type = PRE2;
>>>>>>  
>>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>>  
>>>>>> +next_step:
>>>>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>>>   while (1) {
>>>>>>   int i;
>>>>>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>>   for (i = start; i < end; i++)
>>>>>>   clear_bit(i, prefree_map);
>>>>>>  
>>>>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>>>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>>>  
>>>>>>   if (!test_opt(sbi, DISCARD))
>>>>>>   continue;
>>>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>>   else
>>>>>>   end = start - 1;
>>>>>>   }
>>>>>> + if (phase == 0) {
>>>>>> + /* status change: PRE -> PRE2 */
>>>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>>> + if (!test_and_set_bit(segno, prefree_map))
>>>>>> + dirty_i->nr_dirty[PRE2]++;
>>>>>> +
>>>>>> + phase = 1;
>>>>>> + dirty_type = PRE;
>>>>>> + goto next_step;
>>>>>> + }
>>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>>  
>>>>>>   /* send small discards */
>>>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>>>  
>>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>>   __remove_dirty_segment(sbi, new_segno, PRE);
>>>>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>>  
>>>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>>>> index 71a2aaa..f702726 100644
>>>>>> --- a/fs/f2fs/segment.h
>>>>>> +++ b/fs/f2fs/segment.h
>>>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>>>>   DIRTY, /* to count # of dirty segments */
>>>>>>   PRE, /* to count # of entirely obsolete segments */
>>>>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>>>   NR_DIRTY_TYPE
>>>>>>  };
>>>>>>  
>>>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>>>  }
>>>>>>  
>>>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>>>> +{
>>>>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>>>> +}
>>>>>> +
>>>>>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>>>  {
>>>>>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>> .
>>>
>>
>>
>> .
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
@ 2018-01-22  8:27             ` Chao Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2018-01-22  8:27 UTC (permalink / raw)
  To: guoweichao, Chao Yu, jaegeuk; +Cc: linux-fsdevel, heyunlei, linux-f2fs-devel

On 2018/1/22 16:03, guoweichao wrote:
> 
> 
> On 2018/1/22 14:19, Chao Yu wrote:
>> On 2018/1/22 10:40, guoweichao wrote:
>>> Hi Chao,
>>>
>>> On 2018/1/21 10:34, Chao Yu wrote:
>>>> Hi Weichao,
>>>>
>>>> On 2018/1/20 23:50, guoweichao wrote:
>>>>> Hi Chao,
>>>>>
>>>>> Yes, it is exactly what I mean.
>>>>> It seems that F2FS has no responsibility to cover hardware problems.
>>>>> However, file systems usually choose redundancy for super block fault tolerance.
>>>>> So I think we actually have considered some external errors when designing a file system.
>>>>> Our dual checkpoint mechanism is mainly designed to keep at least one stable
>>>>> CP pack while creating a new one in case of SPO. And it has fault tolerant effects.
>>>>> As CP pack is also very critical to F2FS, why not make checkpoint more robustness
>>>>
>>>> I think you're trying to change basic design of A/B upgrade system to A/B/C one,
>>>> which can keep always two checkpoint valid. There will be no simple modification
>>>> to implement that one, in where we should cover not only prefree case but also
>>>> SSR case.
>>>>
>>> Nope, I do not intent to add another checkpoint. My main idea is reusing the invalid blocks
>>> after two checkpoints are recorded them as invalid. We could mark or record the blocks that
>>> are free to one checkpoint and valid to the other one as PARTIAL_FREE. And when set prefree
>>> segments as free or use invalid blocks in SSR mode, just igore PARTIAL_FREE blocks. If there
>>> is no other blocks can be used, just write a new checkpoint.
>>
>> Actually, your implementation looks like a variant of A/B/C upgrade system
>> (or A/B upgrade system), as checkpoint C (or inflight A/B) uses main area
>> separated from checkpoint A/B, and its inflight dirty metadata will persist
>> into checkpoint A or B since we haven't actually allocated checkpoint C
>> area during mkfs.
>>
>> Anyway, I don't think we have a strong reason to implement this one.
>>
>>>
>>>> IMO, the biggest problem there is available space, since in checkpoint C, we can
>>>> only use invalid blocks in both checkpoint A and B, so in some cases there will
>>>> almost be no valid space we can use during allocation, result in frequently
>>>> checkpoint.
>>>>
>>>> IMO, what we can do is trying to keep last valid checkpoint being integrity as
>>>> possible as we can. One way is that we can add mirror or parity for the
>>>> checkpoint which can help to do recovery once checkpoint is corrupted. At
>>>> least, I hope that with it in debug version we can help hardware staff to fix
>>>> their issue instead of wasting much time to troubleshoot filesystem issue.
>>>>
>>> Yes, I agree. Adding a backup for the latest checkpoint in debug version is OK. Sometimes,
>>
>> Parity will be better due to light overhead?
> Yes, backup may also cause disk layout changes. How about adding a parity block for the blocks

Right,

> in a checkpoint similar to RAID-5 or other erasure code methods?

Yes, I think we can.

Thanks,

> 
> Thanks,
> 
>>
>> Thanks,
>>
>>> hardware issues are hard to uncover. We should try to separate them from F2FS.
>>>
>>> Thanks,
>>>
>>>> Thanks,
>>>>
>>>>> with simple modification in current design and little overhead except for FG_GC.
>>>>> Of cause, keep unchanged is OK. I just want to discuss this proposal. :)
>>>>>
>>>>> Thanks,
>>>>> *From:*Chao Yu
>>>>> *To:*guoweichao,jaegeuk@kernel.org,
>>>>> *Cc:*linux-f2fs-devel@lists.sourceforge.net,linux-fsdevel@vger.kernel.org,heyunlei,
>>>>> *Date:*2018-01-20 15:43:23
>>>>> *Subject:*Re: [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other
>>>>>
>>>>> Hi Weichao,
>>>>>
>>>>> On 2018/1/20 7:29, Weichao Guo wrote:
>>>>>> Currently, we set prefree segments as free ones after writing
>>>>>> a checkpoint, then believe the segments could be used safely.
>>>>>> However, if a sudden power-off coming and the newer checkpoint
>>>>>> corrupted due to hardware issues at the same time, we will try
>>>>>> to use the old checkpoint and may face an inconsistent file
>>>>>> system status.
>>>>>
>>>>> IIUC, you mean:
>>>>>
>>>>> 1. write nodes into segment x;
>>>>> 2. write checkpoint A;
>>>>> 3. remove nodes in segment x;
>>>>> 4. write checkpoint B;
>>>>> 5. issue discard or write datas into segment x;
>>>>> 6. sudden power-cut
>>>>>
>>>>> But after reboot, we found checkpoint B is corrupted due to hardware, and
>>>>> then start to use checkpoint A, but nodes in segment x recorded as valid
>>>>> data in checkpoint A has been overcovered in step 5), so we will encounter
>>>>> inconsistent meta data, right?
>>>>>
>>>>> Thanks,
>>>>>
>>>>>>
>>>>>> How about add an PRE2 status for prefree segments, and make
>>>>>> sure the segments could be used safely to both checkpoints?
>>>>>> Or any better solutions? Or this is not a problem?
>>>>>>
>>>>>> Look forward to your comments!
>>>>>>
>>>>>> Signed-off-by: Weichao Guo <guoweichao@huawei.com>
>>>>>> ---
>>>>>>  fs/f2fs/gc.c      | 11 +++++++++--
>>>>>>  fs/f2fs/segment.c | 21 ++++++++++++++++++---
>>>>>>  fs/f2fs/segment.h |  6 ++++++
>>>>>>  3 files changed, 33 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>>>>>> index 33e7969..153e3ea 100644
>>>>>> --- a/fs/f2fs/gc.c
>>>>>> +++ b/fs/f2fs/gc.c
>>>>>> @@ -1030,7 +1030,12 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>>    * threshold, we can make them free by checkpoint. Then, we
>>>>>>    * secure free segments which doesn't need fggc any more.
>>>>>>    */
>>>>>> - if (prefree_segments(sbi)) {
>>>>>> + if (prefree_segments(sbi) || prefree2_segments(sbi)) {
>>>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>>> + if (ret)
>>>>>> + goto stop;
>>>>>> + }
>>>>>> + if (has_not_enough_free_secs(sbi, 0, 0) && prefree2_segments(sbi)) {
>>>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>>>   if (ret)
>>>>>>   goto stop;
>>>>>> @@ -1063,8 +1068,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
>>>>>>   goto gc_more;
>>>>>>   }
>>>>>>  
>>>>>> - if (gc_type == FG_GC)
>>>>>> + if (gc_type == FG_GC) {
>>>>>> + ret = write_checkpoint(sbi, &cpc);
>>>>>>   ret = write_checkpoint(sbi, &cpc);
>>>>>> + }
>>>>>>   }
>>>>>>  stop:
>>>>>>   SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
>>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>>>> index 2e8e054d..9dec445 100644
>>>>>> --- a/fs/f2fs/segment.c
>>>>>> +++ b/fs/f2fs/segment.c
>>>>>> @@ -1606,7 +1606,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
>>>>>>   unsigned int segno;
>>>>>>  
>>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>> - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE2], MAIN_SEGS(sbi))
>>>>>>   __set_test_and_free(sbi, segno);
>>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>>  }
>>>>>> @@ -1617,13 +1617,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>>   struct list_head *head = &dcc->entry_list;
>>>>>>   struct discard_entry *entry, *this;
>>>>>>   struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
>>>>>> - unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
>>>>>> + unsigned long *prefree_map;
>>>>>>   unsigned int start = 0, end = -1;
>>>>>>   unsigned int secno, start_segno;
>>>>>>   bool force = (cpc->reason & CP_DISCARD);
>>>>>> + int phase = 0;
>>>>>> + enum dirty_type dirty_type = PRE2;
>>>>>>  
>>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>>  
>>>>>> +next_step:
>>>>>> + prefree_map = dirty_i->dirty_segmap[dirty_type];
>>>>>>   while (1) {
>>>>>>   int i;
>>>>>>   start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
>>>>>> @@ -1635,7 +1639,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>>   for (i = start; i < end; i++)
>>>>>>   clear_bit(i, prefree_map);
>>>>>>  
>>>>>> - dirty_i->nr_dirty[PRE] -= end - start;
>>>>>> + dirty_i->nr_dirty[dirty_type] -= end - start;
>>>>>>  
>>>>>>   if (!test_opt(sbi, DISCARD))
>>>>>>   continue;
>>>>>> @@ -1663,6 +1667,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>>>>   else
>>>>>>   end = start - 1;
>>>>>>   }
>>>>>> + if (phase == 0) {
>>>>>> + /* status change: PRE -> PRE2 */
>>>>>> + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
>>>>>> + if (!test_and_set_bit(segno, prefree_map))
>>>>>> + dirty_i->nr_dirty[PRE2]++;
>>>>>> +
>>>>>> + phase = 1;
>>>>>> + dirty_type = PRE;
>>>>>> + goto next_step;
>>>>>> + }
>>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>>  
>>>>>>   /* send small discards */
>>>>>> @@ -2245,6 +2259,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
>>>>>>  
>>>>>>   mutex_lock(&dirty_i->seglist_lock);
>>>>>>   __remove_dirty_segment(sbi, new_segno, PRE);
>>>>>> + __remove_dirty_segment(sbi, new_segno, PRE2);
>>>>>>   __remove_dirty_segment(sbi, new_segno, DIRTY);
>>>>>>   mutex_unlock(&dirty_i->seglist_lock);
>>>>>>  
>>>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>>>> index 71a2aaa..f702726 100644
>>>>>> --- a/fs/f2fs/segment.h
>>>>>> +++ b/fs/f2fs/segment.h
>>>>>> @@ -263,6 +263,7 @@ enum dirty_type {
>>>>>>   DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
>>>>>>   DIRTY, /* to count # of dirty segments */
>>>>>>   PRE, /* to count # of entirely obsolete segments */
>>>>>> + PRE2, /* to count # of the segments free to one checkpoint but obsolete to the other checkpoint */
>>>>>>   NR_DIRTY_TYPE
>>>>>>  };
>>>>>>  
>>>>>> @@ -477,6 +478,11 @@ static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
>>>>>>   return DIRTY_I(sbi)->nr_dirty[PRE];
>>>>>>  }
>>>>>>  
>>>>>> +static inline unsigned int prefree2_segments(struct f2fs_sb_info *sbi)
>>>>>> +{
>>>>>> + return DIRTY_I(sbi)->nr_dirty[PRE2];
>>>>>> +}
>>>>>> +
>>>>>>  static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
>>>>>>  {
>>>>>>   return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
>>>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>> .
>>>
>>
>>
>> .
>>
> 
> 
> .
> 


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2018-01-22  8:28 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-19 23:29 [PATCH RFC] f2fs: add PRE2 to mark segments free to one checkpoint but obsolete to the other Weichao Guo
2018-01-19 23:29 ` Weichao Guo
2018-01-19 15:47 ` guoweichao
2018-01-19 15:47   ` guoweichao
2018-01-20  3:48   ` Gao Xiang via Linux-f2fs-devel
2018-01-21  3:27     ` Gao Xiang via Linux-f2fs-devel
2018-01-22  3:01       ` guoweichao
2018-01-22  5:06         ` Gaoxiang (OS)
2018-01-20  7:43 ` Chao Yu
2018-01-21  2:34   ` Chao Yu
2018-01-21  2:34     ` Chao Yu
2018-01-21  3:12     ` Gao Xiang via Linux-f2fs-devel
2018-01-22  8:25       ` [f2fs-dev] " Chao Yu
2018-01-22  8:25         ` Chao Yu
2018-01-22  2:40     ` guoweichao
2018-01-22  2:40       ` guoweichao
2018-01-22  6:19       ` Chao Yu
2018-01-22  8:03         ` guoweichao
2018-01-22  8:27           ` Chao Yu
2018-01-22  8:27             ` Chao Yu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.