* [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc @ 2018-07-30 1:32 Jaegeuk Kim 2018-07-30 3:29 ` Chao Yu 2018-08-04 22:29 ` [PATCH v2] " Jaegeuk Kim 0 siblings, 2 replies; 21+ messages in thread From: Jaegeuk Kim @ 2018-07-30 1:32 UTC (permalink / raw) To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim The f2fs_gc() called by f2fs_balance_fs() must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. If GC hits the maximum number of retries, let's give it a chance to release gc_mutex for a short time so that it does not go into a livelock in the worst case. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- fs/f2fs/gc.c | 22 ++++++++++++----- fs/f2fs/segment.c | 5 +++- fs/f2fs/segment.h | 2 +- 5 files changed, 51 insertions(+), 41 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a9447c7d6570..50349780001b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 78c1bd6b8497..2b7d26ebb294 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); @@ -1190,14 +1192,18 @@ static int 
f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) f2fs_drop_extent_tree(inode); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_unlock_op(sbi); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) goto out_unlock; - truncate_pagecache(inode, offset); - - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) goto out_unlock; @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; - truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = 
pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, offset); + while (!ret && idx > pg_start) { nr = idx - pg_start; if (nr > delta) @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1736,7 +1730,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1754,9 +1747,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1782,7 +1775,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2378,15 +2370,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - 
down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2432,6 +2419,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, len >> F2FS_BLKSIZE_BITS, false); @@ -2442,14 +2437,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, else if (dst_osize != dst->i_size) f2fs_i_size_write(dst, dst_osize); } + if (src != dst) + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); f2fs_unlock_op(sbi); out_unlock: - if (src != dst) { - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); + if (src != dst) inode_unlock(dst); - } out: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e352fbd33848..cac317e37306 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + sbi->skipped_gc_rwsem++; continue; } @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1103,7 +1106,8 @@ int 
f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3662e1f429b4..15b3b095fd58 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); mutex_lock(&fi->inmem_lock); err = __f2fs_commit_inmem_pages(inode); @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 50495515f0a0..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,7 +215,7 @@ struct segment_allocation { #define 
IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; -- 2.17.0.441.gb46fe60e1d-goog ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 1:32 [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc Jaegeuk Kim @ 2018-07-30 3:29 ` Chao Yu 2018-07-30 4:18 ` Jaegeuk Kim 2018-08-04 22:29 ` [PATCH v2] " Jaegeuk Kim 1 sibling, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-07-30 3:29 UTC (permalink / raw) To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel On 2018/7/30 9:32, Jaegeuk Kim wrote: > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > If it hits the miximum retrials in GC, let's give a chance to release > gc_mutex for a short time in order not to go into live lock in the worst > case. > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > --- > fs/f2fs/f2fs.h | 1 + > fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- > fs/f2fs/gc.c | 22 ++++++++++++----- > fs/f2fs/segment.c | 5 +++- > fs/f2fs/segment.h | 2 +- > 5 files changed, 51 insertions(+), 41 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index a9447c7d6570..50349780001b 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > unsigned int gc_mode; /* current GC state */ > /* for skip statistic */ > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > /* threshold for gc trials on pinned files */ > u64 gc_pin_file_threshold; > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > index 78c1bd6b8497..2b7d26ebb294 100644 > --- a/fs/f2fs/file.c > +++ b/fs/f2fs/file.c > @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, > return ret; > } > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > { > struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > pgoff_t nrpages = (i_size_read(inode) + 
PAGE_SIZE - 1) / PAGE_SIZE; > + pgoff_t start = offset >> PAGE_SHIFT; > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > int ret; > > f2fs_balance_fs(sbi, true); > @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > > f2fs_drop_extent_tree(inode); > > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + truncate_pagecache(inode, offset); > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + > f2fs_unlock_op(sbi); > return ret; > } > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > { > - pgoff_t pg_start, pg_end; > loff_t new_size; > int ret; > > @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > return ret; > > - pg_start = offset >> PAGE_SHIFT; > - pg_end = (offset + len) >> PAGE_SHIFT; > - > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > down_write(&F2FS_I(inode)->i_mmap_sem); > /* write out all dirty pages from offset */ > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > if (ret) > goto out_unlock; > > - truncate_pagecache(inode, offset); > - > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > + ret = f2fs_do_collapse(inode, offset, len); > if (ret) > goto out_unlock; > > @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out_unlock: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_balance_fs(sbi, true); > > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > 
down_write(&F2FS_I(inode)->i_mmap_sem); > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > if (ret) > @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > goto out; > > - truncate_pagecache(inode, offset); > - > pg_start = offset >> PAGE_SHIFT; > pg_end = (offset + len) >> PAGE_SHIFT; > delta = pg_end - pg_start; > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + truncate_pagecache(inode, offset); > + > while (!ret && idx > pg_start) { > nr = idx - pg_start; > if (nr > delta) > @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > idx + delta, nr, false); > f2fs_unlock_op(sbi); > } > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > /* write out all moved pages, if possible */ > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > inode_lock(inode); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); After removing the lock here, how do we handle the condition below: commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 Author: Chao Yu <yuchao0@huawei.com> Date: Tue Apr 17 17:51:28 2018 +0800 f2fs: fix race in between GC and atomic open Thread GC thread - f2fs_ioc_start_atomic_write - get_dirty_pages - filemap_write_and_wait_range - f2fs_gc - do_garbage_collect - gc_data_segment - move_data_page - f2fs_is_atomic_file - set_page_dirty - set_inode_flag(, FI_ATOMIC_FILE) Dirty data page can still be generated by GC in race condition as above call stack. 
This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write to avoid such race. Thanks, > - > if (f2fs_is_atomic_file(inode)) { > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > ret = -EINVAL; > @@ -1736,7 +1730,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > stat_inc_atomic_write(inode); > stat_update_max_atomic_write(inode); > out: > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -1754,9 +1747,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > if (ret) > return ret; > > - inode_lock(inode); > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + inode_lock(inode); > > if (f2fs_is_volatile_file(inode)) { > ret = -EINVAL; > @@ -1782,7 +1775,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > ret = -EINVAL; > } > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -2378,15 +2370,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > } > > inode_lock(src); > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > if (src != dst) { > ret = -EBUSY; > if (!inode_trylock(dst)) > goto out; > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > - inode_unlock(dst); > - goto out; > - } > } > > ret = -EINVAL; > @@ -2432,6 +2419,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > f2fs_balance_fs(sbi, true); > f2fs_lock_op(sbi); > + > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > + if (src != dst) { > + ret = -EBUSY; > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > + goto out_src; > + } > + > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > pos_out >> F2FS_BLKSIZE_BITS, > len >> F2FS_BLKSIZE_BITS, false); > @@ -2442,14 +2437,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t 
pos_in, > else if (dst_osize != dst->i_size) > f2fs_i_size_write(dst, dst_osize); > } > + if (src != dst) > + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > +out_src: > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > f2fs_unlock_op(sbi); > out_unlock: > - if (src != dst) { > - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > + if (src != dst) > inode_unlock(dst); > - } > out: > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > inode_unlock(src); > return ret; > } > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > index e352fbd33848..cac317e37306 100644 > --- a/fs/f2fs/gc.c > +++ b/fs/f2fs/gc.c > @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > if (!down_write_trylock( > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > iput(inode); > + sbi->skipped_gc_rwsem++; > continue; > } > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > continue; > if (!down_write_trylock( > &fi->i_gc_rwsem[WRITE])) { > + sbi->skipped_gc_rwsem++; > up_write(&fi->i_gc_rwsem[READ]); > continue; > } > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > prefree_segments(sbi)); > > cpc.reason = __get_cp_reason(sbi); > + sbi->skipped_gc_rwsem = 0; > gc_more: > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > ret = -EINVAL; > @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > total_freed += seg_freed; > > if (gc_type == FG_GC) { > - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > + sbi->skipped_gc_rwsem) > skipped_round++; > last_skipped = sbi->skipped_atomic_files[FG_GC]; > round++; > @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > if (gc_type == FG_GC) > sbi->cur_victim_sec = NULL_SEGNO; > > - if (!sync) { > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > - skipped_round * 2 >= round) > - f2fs_drop_inmem_pages_all(sbi, true); 
> + if (sync) > + goto stop; > + > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > + if (skipped_round <= MAX_SKIP_GC_COUNT || > + skipped_round * 2 < round) { > segno = NULL_SEGNO; > goto gc_more; > } > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { > + f2fs_drop_inmem_pages_all(sbi, true); > + segno = NULL_SEGNO; > + goto gc_more; > + } > if (gc_type == FG_GC) > ret = f2fs_write_checkpoint(sbi, &cpc); > } > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > index 3662e1f429b4..15b3b095fd58 100644 > --- a/fs/f2fs/segment.c > +++ b/fs/f2fs/segment.c > @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) > struct f2fs_inode_info *fi = F2FS_I(inode); > int err; > > - f2fs_balance_fs(sbi, true); > + f2fs_balance_fs(F2FS_I_SB(inode), true); > + > f2fs_lock_op(sbi); > > set_inode_flag(inode, FI_ATOMIC_COMMIT); > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > mutex_lock(&fi->inmem_lock); > err = __f2fs_commit_inmem_pages(inode); > @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) > spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); > mutex_unlock(&fi->inmem_lock); > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > clear_inode_flag(inode, FI_ATOMIC_COMMIT); > > f2fs_unlock_op(sbi); > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > index 50495515f0a0..b3d9e317ff0c 100644 > --- a/fs/f2fs/segment.h > +++ b/fs/f2fs/segment.h > @@ -215,7 +215,7 @@ struct segment_allocation { > #define IS_DUMMY_WRITTEN_PAGE(page) \ > (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > > -#define MAX_SKIP_ATOMIC_COUNT 16 > +#define MAX_SKIP_GC_COUNT 16 > > struct inmem_pages { > struct list_head list; > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 3:29 ` Chao Yu @ 2018-07-30 4:18 ` Jaegeuk Kim 2018-07-30 6:36 ` Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-07-30 4:18 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 07/30, Chao Yu wrote: > On 2018/7/30 9:32, Jaegeuk Kim wrote: > > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > > > If it hits the miximum retrials in GC, let's give a chance to release > > gc_mutex for a short time in order not to go into live lock in the worst > > case. > > > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > > --- > > fs/f2fs/f2fs.h | 1 + > > fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- > > fs/f2fs/gc.c | 22 ++++++++++++----- > > fs/f2fs/segment.c | 5 +++- > > fs/f2fs/segment.h | 2 +- > > 5 files changed, 51 insertions(+), 41 deletions(-) > > > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > > index a9447c7d6570..50349780001b 100644 > > --- a/fs/f2fs/f2fs.h > > +++ b/fs/f2fs/f2fs.h > > @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > > unsigned int gc_mode; /* current GC state */ > > /* for skip statistic */ > > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > > > /* threshold for gc trials on pinned files */ > > u64 gc_pin_file_threshold; > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > > index 78c1bd6b8497..2b7d26ebb294 100644 > > --- a/fs/f2fs/file.c > > +++ b/fs/f2fs/file.c > > @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, > > return ret; > > } > > > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > > +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > > { > > struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > > pgoff_t nrpages = (i_size_read(inode) 
+ PAGE_SIZE - 1) / PAGE_SIZE; > > + pgoff_t start = offset >> PAGE_SHIFT; > > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > > int ret; > > > > f2fs_balance_fs(sbi, true); > > @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > > > > f2fs_drop_extent_tree(inode); > > > > + /* avoid gc operation during block exchange */ > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + truncate_pagecache(inode, offset); > > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + > > f2fs_unlock_op(sbi); > > return ret; > > } > > > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > { > > - pgoff_t pg_start, pg_end; > > loff_t new_size; > > int ret; > > > > @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > if (ret) > > return ret; > > > > - pg_start = offset >> PAGE_SHIFT; > > - pg_end = (offset + len) >> PAGE_SHIFT; > > - > > - /* avoid gc operation during block exchange */ > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > down_write(&F2FS_I(inode)->i_mmap_sem); > > /* write out all dirty pages from offset */ > > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > > if (ret) > > goto out_unlock; > > > > - truncate_pagecache(inode, offset); > > - > > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > > + ret = f2fs_do_collapse(inode, offset, len); > > if (ret) > > goto out_unlock; > > > > @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_i_size_write(inode, new_size); > > out_unlock: > > up_write(&F2FS_I(inode)->i_mmap_sem); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > > > f2fs_balance_fs(sbi, true); > > > > - /* avoid gc 
operation during block exchange */ > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > down_write(&F2FS_I(inode)->i_mmap_sem); > > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > > if (ret) > > @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > if (ret) > > goto out; > > > > - truncate_pagecache(inode, offset); > > - > > pg_start = offset >> PAGE_SHIFT; > > pg_end = (offset + len) >> PAGE_SHIFT; > > delta = pg_end - pg_start; > > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > > > + /* avoid gc operation during block exchange */ > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + truncate_pagecache(inode, offset); > > + > > while (!ret && idx > pg_start) { > > nr = idx - pg_start; > > if (nr > delta) > > @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > idx + delta, nr, false); > > f2fs_unlock_op(sbi); > > } > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > /* write out all moved pages, if possible */ > > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > > @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_i_size_write(inode, new_size); > > out: > > up_write(&F2FS_I(inode)->i_mmap_sem); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > > > inode_lock(inode); > > > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > After removing lock here, how can we handle below condition: > > commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 > Author: Chao Yu <yuchao0@huawei.com> > Date: Tue Apr 17 17:51:28 2018 +0800 > > f2fs: fix race in between GC and atomic open > > Thread GC thread > - f2fs_ioc_start_atomic_write > - get_dirty_pages > - filemap_write_and_wait_range > - f2fs_gc > - do_garbage_collect > - gc_data_segment > - 
move_data_page > - f2fs_is_atomic_file > - set_page_dirty > - set_inode_flag(, FI_ATOMIC_FILE) > > Dirty data page can still be generated by GC in race condition as > above call stack. > > This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write > to avoid such race. "f2fs: don't allow any writes on aborted atomic writes" disallows any writes on an atomic file that has the revoke flag set. So, this won't happen. In GC, f2fs_is_atomic_file won't make the page dirty. WDYT? Thanks, > > Thanks, > > > - > > if (f2fs_is_atomic_file(inode)) { > > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > > ret = -EINVAL; > > @@ -1736,7 +1730,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > stat_inc_atomic_write(inode); > > stat_update_max_atomic_write(inode); > > out: > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > inode_unlock(inode); > > mnt_drop_write_file(filp); > > return ret; > > @@ -1754,9 +1747,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > > if (ret) > > return ret; > > > > - inode_lock(inode); > > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + inode_lock(inode); > > > > if (f2fs_is_volatile_file(inode)) { > > ret = -EINVAL; > > @@ -1782,7 +1775,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > > ret = -EINVAL; > > } > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > inode_unlock(inode); > > mnt_drop_write_file(filp); > > return ret; > > @@ -2378,15 +2370,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > } > > > > inode_lock(src); > > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > if (src != dst) { > > ret = -EBUSY; > > if (!inode_trylock(dst)) > > goto out; > > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > > - inode_unlock(dst); > > - goto out; > > - } > > } > > > > ret = -EINVAL; > > @@ -2432,6 +2419,14 @@ static int 
f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > > > f2fs_balance_fs(sbi, true); > > f2fs_lock_op(sbi); > > + > > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > + if (src != dst) { > > + ret = -EBUSY; > > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > > + goto out_src; > > + } > > + > > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > > pos_out >> F2FS_BLKSIZE_BITS, > > len >> F2FS_BLKSIZE_BITS, false); > > @@ -2442,14 +2437,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > else if (dst_osize != dst->i_size) > > f2fs_i_size_write(dst, dst_osize); > > } > > + if (src != dst) > > + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > > +out_src: > > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > f2fs_unlock_op(sbi); > > out_unlock: > > - if (src != dst) { > > - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > > + if (src != dst) > > inode_unlock(dst); > > - } > > out: > > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > inode_unlock(src); > > return ret; > > } > > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > > index e352fbd33848..cac317e37306 100644 > > --- a/fs/f2fs/gc.c > > +++ b/fs/f2fs/gc.c > > @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > > if (!down_write_trylock( > > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > > iput(inode); > > + sbi->skipped_gc_rwsem++; > > continue; > > } > > > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > > continue; > > if (!down_write_trylock( > > &fi->i_gc_rwsem[WRITE])) { > > + sbi->skipped_gc_rwsem++; > > up_write(&fi->i_gc_rwsem[READ]); > > continue; > > } > > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > prefree_segments(sbi)); > > > > cpc.reason = __get_cp_reason(sbi); > > + sbi->skipped_gc_rwsem = 0; > > gc_more: > > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > > ret = -EINVAL; > > @@ -1103,7 +1106,8 @@ int 
f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > total_freed += seg_freed; > > > > if (gc_type == FG_GC) { > > - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > > + sbi->skipped_gc_rwsem) > > skipped_round++; > > last_skipped = sbi->skipped_atomic_files[FG_GC]; > > round++; > > @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > if (gc_type == FG_GC) > > sbi->cur_victim_sec = NULL_SEGNO; > > > > - if (!sync) { > > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > > - skipped_round * 2 >= round) > > - f2fs_drop_inmem_pages_all(sbi, true); > > + if (sync) > > + goto stop; > > + > > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > > + if (skipped_round <= MAX_SKIP_GC_COUNT || > > + skipped_round * 2 < round) { > > segno = NULL_SEGNO; > > goto gc_more; > > } > > > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { > > + f2fs_drop_inmem_pages_all(sbi, true); > > + segno = NULL_SEGNO; > > + goto gc_more; > > + } > > if (gc_type == FG_GC) > > ret = f2fs_write_checkpoint(sbi, &cpc); > > } > > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > > index 3662e1f429b4..15b3b095fd58 100644 > > --- a/fs/f2fs/segment.c > > +++ b/fs/f2fs/segment.c > > @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) > > struct f2fs_inode_info *fi = F2FS_I(inode); > > int err; > > > > - f2fs_balance_fs(sbi, true); > > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > + > > f2fs_lock_op(sbi); > > > > set_inode_flag(inode, FI_ATOMIC_COMMIT); > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > mutex_lock(&fi->inmem_lock); > > err = __f2fs_commit_inmem_pages(inode); > > @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) > > spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); > > mutex_unlock(&fi->inmem_lock); > > > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > clear_inode_flag(inode, 
FI_ATOMIC_COMMIT); > > > > f2fs_unlock_op(sbi); > > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > > index 50495515f0a0..b3d9e317ff0c 100644 > > --- a/fs/f2fs/segment.h > > +++ b/fs/f2fs/segment.h > > @@ -215,7 +215,7 @@ struct segment_allocation { > > #define IS_DUMMY_WRITTEN_PAGE(page) \ > > (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > > > > -#define MAX_SKIP_ATOMIC_COUNT 16 > > +#define MAX_SKIP_GC_COUNT 16 > > > > struct inmem_pages { > > struct list_head list; > > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 4:18 ` Jaegeuk Kim @ 2018-07-30 6:36 ` Chao Yu 2018-07-30 9:08 ` Jaegeuk Kim 0 siblings, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-07-30 6:36 UTC (permalink / raw) To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel On 2018/7/30 12:18, Jaegeuk Kim wrote: > On 07/30, Chao Yu wrote: >> On 2018/7/30 9:32, Jaegeuk Kim wrote: >>> The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of >>> fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. >>> >>> If it hits the miximum retrials in GC, let's give a chance to release >>> gc_mutex for a short time in order not to go into live lock in the worst >>> case. >>> >>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> >>> --- >>> fs/f2fs/f2fs.h | 1 + >>> fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- >>> fs/f2fs/gc.c | 22 ++++++++++++----- >>> fs/f2fs/segment.c | 5 +++- >>> fs/f2fs/segment.h | 2 +- >>> 5 files changed, 51 insertions(+), 41 deletions(-) >>> >>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >>> index a9447c7d6570..50349780001b 100644 >>> --- a/fs/f2fs/f2fs.h >>> +++ b/fs/f2fs/f2fs.h >>> @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { >>> unsigned int gc_mode; /* current GC state */ >>> /* for skip statistic */ >>> unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ >>> + unsigned long long skipped_gc_rwsem; /* FG_GC only */ >>> >>> /* threshold for gc trials on pinned files */ >>> u64 gc_pin_file_threshold; >>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >>> index 78c1bd6b8497..2b7d26ebb294 100644 >>> --- a/fs/f2fs/file.c >>> +++ b/fs/f2fs/file.c >>> @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, >>> return ret; >>> } >>> >>> -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >>> +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) >>> { >>> struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); >>> pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >>> + pgoff_t start = offset >> PAGE_SHIFT; >>> + pgoff_t end = (offset + len) >> PAGE_SHIFT; >>> int ret; >>> >>> f2fs_balance_fs(sbi, true); >>> @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >>> >>> f2fs_drop_extent_tree(inode); >>> >>> + /* avoid gc operation during block exchange */ >>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> + truncate_pagecache(inode, offset); >>> ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); >>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> + >>> f2fs_unlock_op(sbi); >>> return ret; >>> } >>> >>> static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>> { >>> - pgoff_t pg_start, pg_end; >>> loff_t new_size; >>> int ret; >>> >>> @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>> if (ret) >>> return ret; >>> >>> - pg_start = offset >> PAGE_SHIFT; >>> - pg_end = (offset + len) >> PAGE_SHIFT; >>> - >>> - /* avoid gc operation during block exchange */ >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> - >>> down_write(&F2FS_I(inode)->i_mmap_sem); >>> /* write out all dirty pages from offset */ >>> ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >>> if (ret) >>> goto out_unlock; >>> >>> - truncate_pagecache(inode, offset); >>> - >>> - ret = f2fs_do_collapse(inode, pg_start, pg_end); >>> + ret = f2fs_do_collapse(inode, offset, len); >>> if (ret) >>> goto out_unlock; >>> >>> @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>> f2fs_i_size_write(inode, new_size); >>> out_unlock: >>> up_write(&F2FS_I(inode)->i_mmap_sem); >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> return ret; >>> } >>> >>> @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>> 
>>> f2fs_balance_fs(sbi, true); >>> >>> - /* avoid gc operation during block exchange */ >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> - >>> down_write(&F2FS_I(inode)->i_mmap_sem); >>> ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); >>> if (ret) >>> @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>> if (ret) >>> goto out; >>> >>> - truncate_pagecache(inode, offset); >>> - >>> pg_start = offset >> PAGE_SHIFT; >>> pg_end = (offset + len) >> PAGE_SHIFT; >>> delta = pg_end - pg_start; >>> idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >>> >>> + /* avoid gc operation during block exchange */ >>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> + truncate_pagecache(inode, offset); >>> + >>> while (!ret && idx > pg_start) { >>> nr = idx - pg_start; >>> if (nr > delta) >>> @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>> idx + delta, nr, false); >>> f2fs_unlock_op(sbi); >>> } >>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> >>> /* write out all moved pages, if possible */ >>> filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >>> @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>> f2fs_i_size_write(inode, new_size); >>> out: >>> up_write(&F2FS_I(inode)->i_mmap_sem); >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> return ret; >>> } >>> >>> @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >>> >>> inode_lock(inode); >>> >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> >> After removing lock here, how can we handle below condition: >> >> commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 >> Author: Chao Yu <yuchao0@huawei.com> >> Date: Tue Apr 17 17:51:28 2018 +0800 >> >> f2fs: fix race in between GC and atomic open >> >> Thread GC thread >> - f2fs_ioc_start_atomic_write >> - get_dirty_pages >> - 
filemap_write_and_wait_range >> - f2fs_gc >> - do_garbage_collect >> - gc_data_segment >> - move_data_page >> - f2fs_is_atomic_file >> - set_page_dirty >> - set_inode_flag(, FI_ATOMIC_FILE) >> >> Dirty data page can still be generated by GC in race condition as >> above call stack. >> >> This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write >> to avoid such race. > > "f2fs: don't allow any writes on aborted atomic writes" disallows any writes > on atomic file which has the revoking flag. So, this won't happen. In GC, Hmmm... In the above condition, it's not related to the FI_ATOMIC_REVOKE_REQUEST flag, since we do not drop any inmem pages for the atomic file. That patch was trying to eliminate a hole which exists between filemap_write_and_wait_range and set_inode_flag(, FI_ATOMIC_FILE), where GC can still dirty pages in the inode, which can pollute the isolation of a database transaction; that is why we need this lock. Thanks, > f2fs_is_atomic_file won't make the page dirty. WDYT? > > Thanks, > > >> >> Thanks, >> >>> - >>> if (f2fs_is_atomic_file(inode)) { >>> if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) >>> ret = -EINVAL; >>> @@ -1736,7 +1730,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >>> stat_inc_atomic_write(inode); >>> stat_update_max_atomic_write(inode); >>> out: >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> inode_unlock(inode); >>> mnt_drop_write_file(filp); >>> return ret; >>> @@ -1754,9 +1747,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) >>> if (ret) >>> return ret; >>> >>> - inode_lock(inode); >>> + f2fs_balance_fs(F2FS_I_SB(inode), true); >>> >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> + inode_lock(inode); >>> >>> if (f2fs_is_volatile_file(inode)) { >>> ret = -EINVAL; >>> @@ -1782,7 +1775,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) >>> clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); >>> ret = -EINVAL; >>> } >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> 
inode_unlock(inode); >>> mnt_drop_write_file(filp); >>> return ret; >>> @@ -2378,15 +2370,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >>> } >>> >>> inode_lock(src); >>> - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>> if (src != dst) { >>> ret = -EBUSY; >>> if (!inode_trylock(dst)) >>> goto out; >>> - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { >>> - inode_unlock(dst); >>> - goto out; >>> - } >>> } >>> >>> ret = -EINVAL; >>> @@ -2432,6 +2419,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >>> >>> f2fs_balance_fs(sbi, true); >>> f2fs_lock_op(sbi); >>> + >>> + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>> + if (src != dst) { >>> + ret = -EBUSY; >>> + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) >>> + goto out_src; >>> + } >>> + >>> ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, >>> pos_out >> F2FS_BLKSIZE_BITS, >>> len >> F2FS_BLKSIZE_BITS, false); >>> @@ -2442,14 +2437,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >>> else if (dst_osize != dst->i_size) >>> f2fs_i_size_write(dst, dst_osize); >>> } >>> + if (src != dst) >>> + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); >>> +out_src: >>> + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>> f2fs_unlock_op(sbi); >>> out_unlock: >>> - if (src != dst) { >>> - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); >>> + if (src != dst) >>> inode_unlock(dst); >>> - } >>> out: >>> - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>> inode_unlock(src); >>> return ret; >>> } >>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c >>> index e352fbd33848..cac317e37306 100644 >>> --- a/fs/f2fs/gc.c >>> +++ b/fs/f2fs/gc.c >>> @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, >>> if (!down_write_trylock( >>> &F2FS_I(inode)->i_gc_rwsem[WRITE])) { >>> iput(inode); >>> + sbi->skipped_gc_rwsem++; >>> continue; >>> } >>> >>> @@ -913,6 +914,7 @@ static void gc_data_segment(struct 
f2fs_sb_info *sbi, struct f2fs_summary *sum, >>> continue; >>> if (!down_write_trylock( >>> &fi->i_gc_rwsem[WRITE])) { >>> + sbi->skipped_gc_rwsem++; >>> up_write(&fi->i_gc_rwsem[READ]); >>> continue; >>> } >>> @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>> prefree_segments(sbi)); >>> >>> cpc.reason = __get_cp_reason(sbi); >>> + sbi->skipped_gc_rwsem = 0; >>> gc_more: >>> if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { >>> ret = -EINVAL; >>> @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>> total_freed += seg_freed; >>> >>> if (gc_type == FG_GC) { >>> - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) >>> + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || >>> + sbi->skipped_gc_rwsem) >>> skipped_round++; >>> last_skipped = sbi->skipped_atomic_files[FG_GC]; >>> round++; >>> @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>> if (gc_type == FG_GC) >>> sbi->cur_victim_sec = NULL_SEGNO; >>> >>> - if (!sync) { >>> - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { >>> - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && >>> - skipped_round * 2 >= round) >>> - f2fs_drop_inmem_pages_all(sbi, true); >>> + if (sync) >>> + goto stop; >>> + >>> + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { >>> + if (skipped_round <= MAX_SKIP_GC_COUNT || >>> + skipped_round * 2 < round) { >>> segno = NULL_SEGNO; >>> goto gc_more; >>> } >>> >>> + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { >>> + f2fs_drop_inmem_pages_all(sbi, true); >>> + segno = NULL_SEGNO; >>> + goto gc_more; >>> + } >>> if (gc_type == FG_GC) >>> ret = f2fs_write_checkpoint(sbi, &cpc); >>> } >>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c >>> index 3662e1f429b4..15b3b095fd58 100644 >>> --- a/fs/f2fs/segment.c >>> +++ b/fs/f2fs/segment.c >>> @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) >>> struct f2fs_inode_info *fi = F2FS_I(inode); >>> int err; >>> >>> - f2fs_balance_fs(sbi, true); >>> + 
f2fs_balance_fs(F2FS_I_SB(inode), true); >>> + >>> f2fs_lock_op(sbi); >>> >>> set_inode_flag(inode, FI_ATOMIC_COMMIT); >>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> >>> mutex_lock(&fi->inmem_lock); >>> err = __f2fs_commit_inmem_pages(inode); >>> @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) >>> spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); >>> mutex_unlock(&fi->inmem_lock); >>> >>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>> clear_inode_flag(inode, FI_ATOMIC_COMMIT); >>> >>> f2fs_unlock_op(sbi); >>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h >>> index 50495515f0a0..b3d9e317ff0c 100644 >>> --- a/fs/f2fs/segment.h >>> +++ b/fs/f2fs/segment.h >>> @@ -215,7 +215,7 @@ struct segment_allocation { >>> #define IS_DUMMY_WRITTEN_PAGE(page) \ >>> (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) >>> >>> -#define MAX_SKIP_ATOMIC_COUNT 16 >>> +#define MAX_SKIP_GC_COUNT 16 >>> >>> struct inmem_pages { >>> struct list_head list; >>> > > . > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 6:36 ` Chao Yu @ 2018-07-30 9:08 ` Jaegeuk Kim 2018-07-30 9:28 ` Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-07-30 9:08 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 07/30, Chao Yu wrote: > On 2018/7/30 12:18, Jaegeuk Kim wrote: > > On 07/30, Chao Yu wrote: > >> On 2018/7/30 9:32, Jaegeuk Kim wrote: > >>> The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > >>> fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > >>> > >>> If it hits the miximum retrials in GC, let's give a chance to release > >>> gc_mutex for a short time in order not to go into live lock in the worst > >>> case. > >>> > >>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > >>> --- > >>> fs/f2fs/f2fs.h | 1 + > >>> fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- > >>> fs/f2fs/gc.c | 22 ++++++++++++----- > >>> fs/f2fs/segment.c | 5 +++- > >>> fs/f2fs/segment.h | 2 +- > >>> 5 files changed, 51 insertions(+), 41 deletions(-) > >>> > >>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > >>> index a9447c7d6570..50349780001b 100644 > >>> --- a/fs/f2fs/f2fs.h > >>> +++ b/fs/f2fs/f2fs.h > >>> @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > >>> unsigned int gc_mode; /* current GC state */ > >>> /* for skip statistic */ > >>> unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > >>> + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > >>> > >>> /* threshold for gc trials on pinned files */ > >>> u64 gc_pin_file_threshold; > >>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > >>> index 78c1bd6b8497..2b7d26ebb294 100644 > >>> --- a/fs/f2fs/file.c > >>> +++ b/fs/f2fs/file.c > >>> @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, > >>> return ret; > >>> } > >>> > >>> -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > >>> +static int 
f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > >>> { > >>> struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > >>> pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > >>> + pgoff_t start = offset >> PAGE_SHIFT; > >>> + pgoff_t end = (offset + len) >> PAGE_SHIFT; > >>> int ret; > >>> > >>> f2fs_balance_fs(sbi, true); > >>> @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > >>> > >>> f2fs_drop_extent_tree(inode); > >>> > >>> + /* avoid gc operation during block exchange */ > >>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> + truncate_pagecache(inode, offset); > >>> ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > >>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> + > >>> f2fs_unlock_op(sbi); > >>> return ret; > >>> } > >>> > >>> static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > >>> { > >>> - pgoff_t pg_start, pg_end; > >>> loff_t new_size; > >>> int ret; > >>> > >>> @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > >>> if (ret) > >>> return ret; > >>> > >>> - pg_start = offset >> PAGE_SHIFT; > >>> - pg_end = (offset + len) >> PAGE_SHIFT; > >>> - > >>> - /* avoid gc operation during block exchange */ > >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> - > >>> down_write(&F2FS_I(inode)->i_mmap_sem); > >>> /* write out all dirty pages from offset */ > >>> ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > >>> if (ret) > >>> goto out_unlock; > >>> > >>> - truncate_pagecache(inode, offset); > >>> - > >>> - ret = f2fs_do_collapse(inode, pg_start, pg_end); > >>> + ret = f2fs_do_collapse(inode, offset, len); > >>> if (ret) > >>> goto out_unlock; > >>> > >>> @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > >>> f2fs_i_size_write(inode, new_size); > >>> out_unlock: > >>> 
up_write(&F2FS_I(inode)->i_mmap_sem); > >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> return ret; > >>> } > >>> > >>> @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>> > >>> f2fs_balance_fs(sbi, true); > >>> > >>> - /* avoid gc operation during block exchange */ > >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> - > >>> down_write(&F2FS_I(inode)->i_mmap_sem); > >>> ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > >>> if (ret) > >>> @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>> if (ret) > >>> goto out; > >>> > >>> - truncate_pagecache(inode, offset); > >>> - > >>> pg_start = offset >> PAGE_SHIFT; > >>> pg_end = (offset + len) >> PAGE_SHIFT; > >>> delta = pg_end - pg_start; > >>> idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > >>> > >>> + /* avoid gc operation during block exchange */ > >>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> + truncate_pagecache(inode, offset); > >>> + > >>> while (!ret && idx > pg_start) { > >>> nr = idx - pg_start; > >>> if (nr > delta) > >>> @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>> idx + delta, nr, false); > >>> f2fs_unlock_op(sbi); > >>> } > >>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> > >>> /* write out all moved pages, if possible */ > >>> filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > >>> @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>> f2fs_i_size_write(inode, new_size); > >>> out: > >>> up_write(&F2FS_I(inode)->i_mmap_sem); > >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> return ret; > >>> } > >>> > >>> @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > >>> > >>> inode_lock(inode); > >>> > >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >> > >> After removing lock 
here, how can we handle below condition: > >> > >> commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 > >> Author: Chao Yu <yuchao0@huawei.com> > >> Date: Tue Apr 17 17:51:28 2018 +0800 > >> > >> f2fs: fix race in between GC and atomic open > >> > >> Thread GC thread > >> - f2fs_ioc_start_atomic_write > >> - get_dirty_pages > >> - filemap_write_and_wait_range > >> - f2fs_gc > >> - do_garbage_collect > >> - gc_data_segment > >> - move_data_page > >> - f2fs_is_atomic_file > >> - set_page_dirty > >> - set_inode_flag(, FI_ATOMIC_FILE) > >> > >> Dirty data page can still be generated by GC in race condition as > >> above call stack. > >> > >> This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write > >> to avoid such race. > > > > "f2fs: don't allow any writes on aborted atomic writes" disallows any writes > > on atomic file which has the revoking flag. So, this won't happen. In GC, > > Hmmm... In above condition, it's not related to FI_ATOMIC_REVOKE_REQUEST flag > since we do not drop any inmem pages for atomic file. > > That patch was trying to eliminate a hole which exists in between > filemap_write_and_wait_range and set_inode_flag(, FI_ATOMIC_FILE), where GC can > still dirty page in the inode, it can pollute isolation of database transaction, > so that is why we need this lock. Ah, GC can generate any dirty pages of atomic_written data before starting another transaction, right? I think we can do - set_inode_flag() first, followed by - filemap_write_and_wait_range(). Thoughts? > > Thanks, > > > f2fs_is_atomic_file won't make the page dirty. WDYT? 
> > > > Thanks, > > > > > >> > >> Thanks, > >> > >>> - > >>> if (f2fs_is_atomic_file(inode)) { > >>> if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > >>> ret = -EINVAL; > >>> @@ -1736,7 +1730,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > >>> stat_inc_atomic_write(inode); > >>> stat_update_max_atomic_write(inode); > >>> out: > >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> inode_unlock(inode); > >>> mnt_drop_write_file(filp); > >>> return ret; > >>> @@ -1754,9 +1747,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > >>> if (ret) > >>> return ret; > >>> > >>> - inode_lock(inode); > >>> + f2fs_balance_fs(F2FS_I_SB(inode), true); > >>> > >>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> + inode_lock(inode); > >>> > >>> if (f2fs_is_volatile_file(inode)) { > >>> ret = -EINVAL; > >>> @@ -1782,7 +1775,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > >>> clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > >>> ret = -EINVAL; > >>> } > >>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> inode_unlock(inode); > >>> mnt_drop_write_file(filp); > >>> return ret; > >>> @@ -2378,15 +2370,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > >>> } > >>> > >>> inode_lock(src); > >>> - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > >>> if (src != dst) { > >>> ret = -EBUSY; > >>> if (!inode_trylock(dst)) > >>> goto out; > >>> - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > >>> - inode_unlock(dst); > >>> - goto out; > >>> - } > >>> } > >>> > >>> ret = -EINVAL; > >>> @@ -2432,6 +2419,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > >>> > >>> f2fs_balance_fs(sbi, true); > >>> f2fs_lock_op(sbi); > >>> + > >>> + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > >>> + if (src != dst) { > >>> + ret = -EBUSY; > >>> + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > >>> + goto out_src; > >>> + } > >>> + > >>> ret = 
__exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > >>> pos_out >> F2FS_BLKSIZE_BITS, > >>> len >> F2FS_BLKSIZE_BITS, false); > >>> @@ -2442,14 +2437,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > >>> else if (dst_osize != dst->i_size) > >>> f2fs_i_size_write(dst, dst_osize); > >>> } > >>> + if (src != dst) > >>> + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > >>> +out_src: > >>> + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > >>> f2fs_unlock_op(sbi); > >>> out_unlock: > >>> - if (src != dst) { > >>> - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > >>> + if (src != dst) > >>> inode_unlock(dst); > >>> - } > >>> out: > >>> - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > >>> inode_unlock(src); > >>> return ret; > >>> } > >>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > >>> index e352fbd33848..cac317e37306 100644 > >>> --- a/fs/f2fs/gc.c > >>> +++ b/fs/f2fs/gc.c > >>> @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > >>> if (!down_write_trylock( > >>> &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > >>> iput(inode); > >>> + sbi->skipped_gc_rwsem++; > >>> continue; > >>> } > >>> > >>> @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > >>> continue; > >>> if (!down_write_trylock( > >>> &fi->i_gc_rwsem[WRITE])) { > >>> + sbi->skipped_gc_rwsem++; > >>> up_write(&fi->i_gc_rwsem[READ]); > >>> continue; > >>> } > >>> @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > >>> prefree_segments(sbi)); > >>> > >>> cpc.reason = __get_cp_reason(sbi); > >>> + sbi->skipped_gc_rwsem = 0; > >>> gc_more: > >>> if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > >>> ret = -EINVAL; > >>> @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > >>> total_freed += seg_freed; > >>> > >>> if (gc_type == FG_GC) { > >>> - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > >>> + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || 
> >>> + sbi->skipped_gc_rwsem) > >>> skipped_round++; > >>> last_skipped = sbi->skipped_atomic_files[FG_GC]; > >>> round++; > >>> @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > >>> if (gc_type == FG_GC) > >>> sbi->cur_victim_sec = NULL_SEGNO; > >>> > >>> - if (!sync) { > >>> - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > >>> - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > >>> - skipped_round * 2 >= round) > >>> - f2fs_drop_inmem_pages_all(sbi, true); > >>> + if (sync) > >>> + goto stop; > >>> + > >>> + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > >>> + if (skipped_round <= MAX_SKIP_GC_COUNT || > >>> + skipped_round * 2 < round) { > >>> segno = NULL_SEGNO; > >>> goto gc_more; > >>> } > >>> > >>> + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { > >>> + f2fs_drop_inmem_pages_all(sbi, true); > >>> + segno = NULL_SEGNO; > >>> + goto gc_more; > >>> + } > >>> if (gc_type == FG_GC) > >>> ret = f2fs_write_checkpoint(sbi, &cpc); > >>> } > >>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > >>> index 3662e1f429b4..15b3b095fd58 100644 > >>> --- a/fs/f2fs/segment.c > >>> +++ b/fs/f2fs/segment.c > >>> @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) > >>> struct f2fs_inode_info *fi = F2FS_I(inode); > >>> int err; > >>> > >>> - f2fs_balance_fs(sbi, true); > >>> + f2fs_balance_fs(F2FS_I_SB(inode), true); > >>> + > >>> f2fs_lock_op(sbi); > >>> > >>> set_inode_flag(inode, FI_ATOMIC_COMMIT); > >>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> > >>> mutex_lock(&fi->inmem_lock); > >>> err = __f2fs_commit_inmem_pages(inode); > >>> @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) > >>> spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); > >>> mutex_unlock(&fi->inmem_lock); > >>> > >>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>> clear_inode_flag(inode, FI_ATOMIC_COMMIT); > >>> > >>> f2fs_unlock_op(sbi); > >>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > >>> index 
50495515f0a0..b3d9e317ff0c 100644 > >>> --- a/fs/f2fs/segment.h > >>> +++ b/fs/f2fs/segment.h > >>> @@ -215,7 +215,7 @@ struct segment_allocation { > >>> #define IS_DUMMY_WRITTEN_PAGE(page) \ > >>> (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > >>> > >>> -#define MAX_SKIP_ATOMIC_COUNT 16 > >>> +#define MAX_SKIP_GC_COUNT 16 > >>> > >>> struct inmem_pages { > >>> struct list_head list; > >>> > > > > . > > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 9:08 ` Jaegeuk Kim @ 2018-07-30 9:28 ` Chao Yu 2018-07-30 10:00 ` Jaegeuk Kim 0 siblings, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-07-30 9:28 UTC (permalink / raw) To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel On 2018/7/30 17:08, Jaegeuk Kim wrote: > On 07/30, Chao Yu wrote: >> On 2018/7/30 12:18, Jaegeuk Kim wrote: >>> On 07/30, Chao Yu wrote: >>>> On 2018/7/30 9:32, Jaegeuk Kim wrote: >>>>> The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of >>>>> fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. >>>>> >>>>> If it hits the miximum retrials in GC, let's give a chance to release >>>>> gc_mutex for a short time in order not to go into live lock in the worst >>>>> case. >>>>> >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> >>>>> --- >>>>> fs/f2fs/f2fs.h | 1 + >>>>> fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- >>>>> fs/f2fs/gc.c | 22 ++++++++++++----- >>>>> fs/f2fs/segment.c | 5 +++- >>>>> fs/f2fs/segment.h | 2 +- >>>>> 5 files changed, 51 insertions(+), 41 deletions(-) >>>>> >>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >>>>> index a9447c7d6570..50349780001b 100644 >>>>> --- a/fs/f2fs/f2fs.h >>>>> +++ b/fs/f2fs/f2fs.h >>>>> @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { >>>>> unsigned int gc_mode; /* current GC state */ >>>>> /* for skip statistic */ >>>>> unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ >>>>> + unsigned long long skipped_gc_rwsem; /* FG_GC only */ >>>>> >>>>> /* threshold for gc trials on pinned files */ >>>>> u64 gc_pin_file_threshold; >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >>>>> index 78c1bd6b8497..2b7d26ebb294 100644 >>>>> --- a/fs/f2fs/file.c >>>>> +++ b/fs/f2fs/file.c >>>>> @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, >>>>> return ret; >>>>> } >>>>> >>>>> -static int f2fs_do_collapse(struct inode *inode, pgoff_t 
start, pgoff_t end) >>>>> +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) >>>>> { >>>>> struct f2fs_sb_info *sbi = F2FS_I_SB(inode); >>>>> pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >>>>> + pgoff_t start = offset >> PAGE_SHIFT; >>>>> + pgoff_t end = (offset + len) >> PAGE_SHIFT; >>>>> int ret; >>>>> >>>>> f2fs_balance_fs(sbi, true); >>>>> @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >>>>> >>>>> f2fs_drop_extent_tree(inode); >>>>> >>>>> + /* avoid gc operation during block exchange */ >>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> + truncate_pagecache(inode, offset); >>>>> ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); >>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> + >>>>> f2fs_unlock_op(sbi); >>>>> return ret; >>>>> } >>>>> >>>>> static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>>>> { >>>>> - pgoff_t pg_start, pg_end; >>>>> loff_t new_size; >>>>> int ret; >>>>> >>>>> @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>>>> if (ret) >>>>> return ret; >>>>> >>>>> - pg_start = offset >> PAGE_SHIFT; >>>>> - pg_end = (offset + len) >> PAGE_SHIFT; >>>>> - >>>>> - /* avoid gc operation during block exchange */ >>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> - >>>>> down_write(&F2FS_I(inode)->i_mmap_sem); >>>>> /* write out all dirty pages from offset */ >>>>> ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >>>>> if (ret) >>>>> goto out_unlock; >>>>> >>>>> - truncate_pagecache(inode, offset); >>>>> - >>>>> - ret = f2fs_do_collapse(inode, pg_start, pg_end); >>>>> + ret = f2fs_do_collapse(inode, offset, len); >>>>> if (ret) >>>>> goto out_unlock; >>>>> >>>>> @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>>>> f2fs_i_size_write(inode, 
new_size); >>>>> out_unlock: >>>>> up_write(&F2FS_I(inode)->i_mmap_sem); >>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> return ret; >>>>> } >>>>> >>>>> @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>> >>>>> f2fs_balance_fs(sbi, true); >>>>> >>>>> - /* avoid gc operation during block exchange */ >>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> - >>>>> down_write(&F2FS_I(inode)->i_mmap_sem); >>>>> ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); >>>>> if (ret) >>>>> @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>> if (ret) >>>>> goto out; >>>>> >>>>> - truncate_pagecache(inode, offset); >>>>> - >>>>> pg_start = offset >> PAGE_SHIFT; >>>>> pg_end = (offset + len) >> PAGE_SHIFT; >>>>> delta = pg_end - pg_start; >>>>> idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >>>>> >>>>> + /* avoid gc operation during block exchange */ >>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> + truncate_pagecache(inode, offset); >>>>> + >>>>> while (!ret && idx > pg_start) { >>>>> nr = idx - pg_start; >>>>> if (nr > delta) >>>>> @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>> idx + delta, nr, false); >>>>> f2fs_unlock_op(sbi); >>>>> } >>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> >>>>> /* write out all moved pages, if possible */ >>>>> filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >>>>> @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>> f2fs_i_size_write(inode, new_size); >>>>> out: >>>>> up_write(&F2FS_I(inode)->i_mmap_sem); >>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> return ret; >>>>> } >>>>> >>>>> @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >>>>> >>>>> inode_lock(inode); >>>>> >>>>> - 
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>> >>>> After removing lock here, how can we handle below condition: >>>> >>>> commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 >>>> Author: Chao Yu <yuchao0@huawei.com> >>>> Date: Tue Apr 17 17:51:28 2018 +0800 >>>> >>>> f2fs: fix race in between GC and atomic open >>>> >>>> Thread GC thread >>>> - f2fs_ioc_start_atomic_write >>>> - get_dirty_pages >>>> - filemap_write_and_wait_range >>>> - f2fs_gc >>>> - do_garbage_collect >>>> - gc_data_segment >>>> - move_data_page >>>> - f2fs_is_atomic_file >>>> - set_page_dirty >>>> - set_inode_flag(, FI_ATOMIC_FILE) >>>> >>>> Dirty data page can still be generated by GC in race condition as >>>> above call stack. >>>> >>>> This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write >>>> to avoid such race. >>> >>> "f2fs: don't allow any writes on aborted atomic writes" disallows any writes >>> on atomic file which has the revoking flag. So, this won't happen. In GC, >> >> Hmmm... In above condition, it's not related to FI_ATOMIC_REVOKE_REQUEST flag >> since we do not drop any inmem pages for atomic file. >> >> That patch was trying to eliminate a hole which exists in between >> filemap_write_and_wait_range and set_inode_flag(, FI_ATOMIC_FILE), where GC can >> still dirty page in the inode, it can pollute isolation of database transaction, >> so that is why we need this lock. > > Ah, GC can generate any dirty pages of atomic_written data before starting > another transaction, right? Yes, > > I think we can do > - set_inode_flag() first, followed by > - filemap_write_and_wait_range(). If there is a redirty flow during filemap_write_and_wait_range, can the page be registered as an inmem one? f2fs_set_data_page_dirty() ... 
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); return 1; } Another concern is set_inode_flag and filemap_write_and_wait_range can be reorder by CPU pipeline, so the serial should be? set_inode_flag(, FI_ATOMIC_COMMIT) smp_mb() set_inode_flag(, FI_ATOMIC_FILE) smp_mb() ret = filemap_write_and_wait_range if (ret) goto err_out; clear_inode_flag(, FI_ATOMIC_COMMIT) Is that right? Thanks, > > Thoughts? > >> >> Thanks, >> >>> f2fs_is_atomic_file won't make the page dirty. WDYT? >>> >>> Thanks, >>> >>> >>>> >>>> Thanks, >>>> >>>>> - >>>>> if (f2fs_is_atomic_file(inode)) { >>>>> if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) >>>>> ret = -EINVAL; >>>>> @@ -1736,7 +1730,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >>>>> stat_inc_atomic_write(inode); >>>>> stat_update_max_atomic_write(inode); >>>>> out: >>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> inode_unlock(inode); >>>>> mnt_drop_write_file(filp); >>>>> return ret; >>>>> @@ -1754,9 +1747,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) >>>>> if (ret) >>>>> return ret; >>>>> >>>>> - inode_lock(inode); >>>>> + f2fs_balance_fs(F2FS_I_SB(inode), true); >>>>> >>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> + inode_lock(inode); >>>>> >>>>> if (f2fs_is_volatile_file(inode)) { >>>>> ret = -EINVAL; >>>>> @@ -1782,7 +1775,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) >>>>> clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); >>>>> ret = -EINVAL; >>>>> } >>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> inode_unlock(inode); >>>>> mnt_drop_write_file(filp); >>>>> return ret; >>>>> @@ -2378,15 +2370,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >>>>> } >>>>> >>>>> inode_lock(src); >>>>> - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>>>> if (src != dst) { >>>>> ret = -EBUSY; >>>>> if (!inode_trylock(dst)) >>>>> 
goto out; >>>>> - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { >>>>> - inode_unlock(dst); >>>>> - goto out; >>>>> - } >>>>> } >>>>> >>>>> ret = -EINVAL; >>>>> @@ -2432,6 +2419,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >>>>> >>>>> f2fs_balance_fs(sbi, true); >>>>> f2fs_lock_op(sbi); >>>>> + >>>>> + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>>>> + if (src != dst) { >>>>> + ret = -EBUSY; >>>>> + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) >>>>> + goto out_src; >>>>> + } >>>>> + >>>>> ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, >>>>> pos_out >> F2FS_BLKSIZE_BITS, >>>>> len >> F2FS_BLKSIZE_BITS, false); >>>>> @@ -2442,14 +2437,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >>>>> else if (dst_osize != dst->i_size) >>>>> f2fs_i_size_write(dst, dst_osize); >>>>> } >>>>> + if (src != dst) >>>>> + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); >>>>> +out_src: >>>>> + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>>>> f2fs_unlock_op(sbi); >>>>> out_unlock: >>>>> - if (src != dst) { >>>>> - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); >>>>> + if (src != dst) >>>>> inode_unlock(dst); >>>>> - } >>>>> out: >>>>> - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >>>>> inode_unlock(src); >>>>> return ret; >>>>> } >>>>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c >>>>> index e352fbd33848..cac317e37306 100644 >>>>> --- a/fs/f2fs/gc.c >>>>> +++ b/fs/f2fs/gc.c >>>>> @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, >>>>> if (!down_write_trylock( >>>>> &F2FS_I(inode)->i_gc_rwsem[WRITE])) { >>>>> iput(inode); >>>>> + sbi->skipped_gc_rwsem++; >>>>> continue; >>>>> } >>>>> >>>>> @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, >>>>> continue; >>>>> if (!down_write_trylock( >>>>> &fi->i_gc_rwsem[WRITE])) { >>>>> + sbi->skipped_gc_rwsem++; >>>>> up_write(&fi->i_gc_rwsem[READ]); >>>>> continue; 
>>>>> } >>>>> @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>>>> prefree_segments(sbi)); >>>>> >>>>> cpc.reason = __get_cp_reason(sbi); >>>>> + sbi->skipped_gc_rwsem = 0; >>>>> gc_more: >>>>> if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { >>>>> ret = -EINVAL; >>>>> @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>>>> total_freed += seg_freed; >>>>> >>>>> if (gc_type == FG_GC) { >>>>> - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) >>>>> + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || >>>>> + sbi->skipped_gc_rwsem) >>>>> skipped_round++; >>>>> last_skipped = sbi->skipped_atomic_files[FG_GC]; >>>>> round++; >>>>> @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>>>> if (gc_type == FG_GC) >>>>> sbi->cur_victim_sec = NULL_SEGNO; >>>>> >>>>> - if (!sync) { >>>>> - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { >>>>> - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && >>>>> - skipped_round * 2 >= round) >>>>> - f2fs_drop_inmem_pages_all(sbi, true); >>>>> + if (sync) >>>>> + goto stop; >>>>> + >>>>> + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { >>>>> + if (skipped_round <= MAX_SKIP_GC_COUNT || >>>>> + skipped_round * 2 < round) { >>>>> segno = NULL_SEGNO; >>>>> goto gc_more; >>>>> } >>>>> >>>>> + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { >>>>> + f2fs_drop_inmem_pages_all(sbi, true); >>>>> + segno = NULL_SEGNO; >>>>> + goto gc_more; >>>>> + } >>>>> if (gc_type == FG_GC) >>>>> ret = f2fs_write_checkpoint(sbi, &cpc); >>>>> } >>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c >>>>> index 3662e1f429b4..15b3b095fd58 100644 >>>>> --- a/fs/f2fs/segment.c >>>>> +++ b/fs/f2fs/segment.c >>>>> @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) >>>>> struct f2fs_inode_info *fi = F2FS_I(inode); >>>>> int err; >>>>> >>>>> - f2fs_balance_fs(sbi, true); >>>>> + f2fs_balance_fs(F2FS_I_SB(inode), true); >>>>> + >>>>> f2fs_lock_op(sbi); >>>>> >>>>> 
set_inode_flag(inode, FI_ATOMIC_COMMIT); >>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> >>>>> mutex_lock(&fi->inmem_lock); >>>>> err = __f2fs_commit_inmem_pages(inode); >>>>> @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) >>>>> spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); >>>>> mutex_unlock(&fi->inmem_lock); >>>>> >>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>> clear_inode_flag(inode, FI_ATOMIC_COMMIT); >>>>> >>>>> f2fs_unlock_op(sbi); >>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h >>>>> index 50495515f0a0..b3d9e317ff0c 100644 >>>>> --- a/fs/f2fs/segment.h >>>>> +++ b/fs/f2fs/segment.h >>>>> @@ -215,7 +215,7 @@ struct segment_allocation { >>>>> #define IS_DUMMY_WRITTEN_PAGE(page) \ >>>>> (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) >>>>> >>>>> -#define MAX_SKIP_ATOMIC_COUNT 16 >>>>> +#define MAX_SKIP_GC_COUNT 16 >>>>> >>>>> struct inmem_pages { >>>>> struct list_head list; >>>>> >>> >>> . >>> > > . > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 9:28 ` Chao Yu @ 2018-07-30 10:00 ` Jaegeuk Kim 2018-08-04 2:31 ` [f2fs-dev] " Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-07-30 10:00 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 07/30, Chao Yu wrote: > On 2018/7/30 17:08, Jaegeuk Kim wrote: > > On 07/30, Chao Yu wrote: > >> On 2018/7/30 12:18, Jaegeuk Kim wrote: > >>> On 07/30, Chao Yu wrote: > >>>> On 2018/7/30 9:32, Jaegeuk Kim wrote: > >>>>> The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > >>>>> fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > >>>>> > >>>>> If it hits the miximum retrials in GC, let's give a chance to release > >>>>> gc_mutex for a short time in order not to go into live lock in the worst > >>>>> case. > >>>>> > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > >>>>> --- > >>>>> fs/f2fs/f2fs.h | 1 + > >>>>> fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- > >>>>> fs/f2fs/gc.c | 22 ++++++++++++----- > >>>>> fs/f2fs/segment.c | 5 +++- > >>>>> fs/f2fs/segment.h | 2 +- > >>>>> 5 files changed, 51 insertions(+), 41 deletions(-) > >>>>> > >>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > >>>>> index a9447c7d6570..50349780001b 100644 > >>>>> --- a/fs/f2fs/f2fs.h > >>>>> +++ b/fs/f2fs/f2fs.h > >>>>> @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > >>>>> unsigned int gc_mode; /* current GC state */ > >>>>> /* for skip statistic */ > >>>>> unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > >>>>> + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > >>>>> > >>>>> /* threshold for gc trials on pinned files */ > >>>>> u64 gc_pin_file_threshold; > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > >>>>> index 78c1bd6b8497..2b7d26ebb294 100644 > >>>>> --- a/fs/f2fs/file.c > >>>>> +++ b/fs/f2fs/file.c > >>>>> @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode 
*src_inode, > >>>>> return ret; > >>>>> } > >>>>> > >>>>> -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > >>>>> +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > >>>>> { > >>>>> struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > >>>>> pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > >>>>> + pgoff_t start = offset >> PAGE_SHIFT; > >>>>> + pgoff_t end = (offset + len) >> PAGE_SHIFT; > >>>>> int ret; > >>>>> > >>>>> f2fs_balance_fs(sbi, true); > >>>>> @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > >>>>> > >>>>> f2fs_drop_extent_tree(inode); > >>>>> > >>>>> + /* avoid gc operation during block exchange */ > >>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> + truncate_pagecache(inode, offset); > >>>>> ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > >>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> + > >>>>> f2fs_unlock_op(sbi); > >>>>> return ret; > >>>>> } > >>>>> > >>>>> static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> { > >>>>> - pgoff_t pg_start, pg_end; > >>>>> loff_t new_size; > >>>>> int ret; > >>>>> > >>>>> @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> if (ret) > >>>>> return ret; > >>>>> > >>>>> - pg_start = offset >> PAGE_SHIFT; > >>>>> - pg_end = (offset + len) >> PAGE_SHIFT; > >>>>> - > >>>>> - /* avoid gc operation during block exchange */ > >>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> - > >>>>> down_write(&F2FS_I(inode)->i_mmap_sem); > >>>>> /* write out all dirty pages from offset */ > >>>>> ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > >>>>> if (ret) > >>>>> goto out_unlock; > >>>>> > >>>>> - truncate_pagecache(inode, offset); > >>>>> - > >>>>> - ret = f2fs_do_collapse(inode, pg_start, pg_end); > >>>>> + ret = 
f2fs_do_collapse(inode, offset, len); > >>>>> if (ret) > >>>>> goto out_unlock; > >>>>> > >>>>> @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> f2fs_i_size_write(inode, new_size); > >>>>> out_unlock: > >>>>> up_write(&F2FS_I(inode)->i_mmap_sem); > >>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> return ret; > >>>>> } > >>>>> > >>>>> @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> > >>>>> f2fs_balance_fs(sbi, true); > >>>>> > >>>>> - /* avoid gc operation during block exchange */ > >>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> - > >>>>> down_write(&F2FS_I(inode)->i_mmap_sem); > >>>>> ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > >>>>> if (ret) > >>>>> @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> if (ret) > >>>>> goto out; > >>>>> > >>>>> - truncate_pagecache(inode, offset); > >>>>> - > >>>>> pg_start = offset >> PAGE_SHIFT; > >>>>> pg_end = (offset + len) >> PAGE_SHIFT; > >>>>> delta = pg_end - pg_start; > >>>>> idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > >>>>> > >>>>> + /* avoid gc operation during block exchange */ > >>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> + truncate_pagecache(inode, offset); > >>>>> + > >>>>> while (!ret && idx > pg_start) { > >>>>> nr = idx - pg_start; > >>>>> if (nr > delta) > >>>>> @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> idx + delta, nr, false); > >>>>> f2fs_unlock_op(sbi); > >>>>> } > >>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> > >>>>> /* write out all moved pages, if possible */ > >>>>> filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > >>>>> @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > >>>>> f2fs_i_size_write(inode, 
new_size); > >>>>> out: > >>>>> up_write(&F2FS_I(inode)->i_mmap_sem); > >>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>>> return ret; > >>>>> } > >>>>> > >>>>> @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > >>>>> > >>>>> inode_lock(inode); > >>>>> > >>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > >>>> > >>>> After removing lock here, how can we handle below condition: > >>>> > >>>> commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 > >>>> Author: Chao Yu <yuchao0@huawei.com> > >>>> Date: Tue Apr 17 17:51:28 2018 +0800 > >>>> > >>>> f2fs: fix race in between GC and atomic open > >>>> > >>>> Thread GC thread > >>>> - f2fs_ioc_start_atomic_write > >>>> - get_dirty_pages > >>>> - filemap_write_and_wait_range > >>>> - f2fs_gc > >>>> - do_garbage_collect > >>>> - gc_data_segment > >>>> - move_data_page > >>>> - f2fs_is_atomic_file > >>>> - set_page_dirty > >>>> - set_inode_flag(, FI_ATOMIC_FILE) > >>>> > >>>> Dirty data page can still be generated by GC in race condition as > >>>> above call stack. > >>>> > >>>> This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write > >>>> to avoid such race. > >>> > >>> "f2fs: don't allow any writes on aborted atomic writes" disallows any writes > >>> on atomic file which has the revoking flag. So, this won't happen. In GC, > >> > >> Hmmm... In above condition, it's not related to FI_ATOMIC_REVOKE_REQUEST flag > >> since we do not drop any inmem pages for atomic file. > >> > >> That patch was trying to eliminate a hole which exists in between > >> filemap_write_and_wait_range and set_inode_flag(, FI_ATOMIC_FILE), where GC can > >> still dirty page in the inode, it can pollute isolation of database transaction, > >> so that is why we need this lock. > > > > Ah, GC can generate any dirty pages of atomic_written data before starting > > another transaction, right? 
> > Yes, > > > > > I think we can do > > - set_inode_flag() first, followed by > > - filemap_write_and_wait_range(). > > If there is redirty flow during filemap_write_and_wait_range, the page can be > register as inmem one? > > f2fs_set_data_page_dirty() > ... > if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { > if (!IS_ATOMIC_WRITTEN_PAGE(page)) { > f2fs_register_inmem_page(inode, page); > return 1; > } > > Another concern is set_inode_flag and filemap_write_and_wait_range can be > reorder by CPU pipeline, so the serial should be? > > set_inode_flag(, FI_ATOMIC_COMMIT) > smp_mb() > set_inode_flag(, FI_ATOMIC_FILE) > smp_mb() > > ret = filemap_write_and_wait_range > if (ret) > goto err_out; > > clear_inode_flag(, FI_ATOMIC_COMMIT) > > Is that right? This is heading the wrong way. Let me try the rwsem again, in a narrowed scope, like this. From f6341121ee0c07fa834960a7c86cb0ea3f824231 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim <jaegeuk@kernel.org> Date: Wed, 25 Jul 2018 12:11:56 +0900 Subject: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc The f2fs_gc() called by f2fs_balance_fs() must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. If it hits the maximum retries in GC, let's give it a chance to release gc_mutex for a short time in order not to go into livelock in the worst case. 
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- v2: - add rwsem on start_atomic_write fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 71 ++++++++++++++++++++++++----------------------- fs/f2fs/gc.c | 22 +++++++++++---- fs/f2fs/segment.c | 5 +++- fs/f2fs/segment.h | 2 +- 5 files changed, 58 insertions(+), 43 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a9447c7d6570..50349780001b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 78c1bd6b8497..a960869bf60f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) f2fs_drop_extent_tree(inode); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_unlock_op(sbi); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode 
*inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) goto out_unlock; - truncate_pagecache(inode, offset); - - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) goto out_unlock; @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; - truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, offset); + while (!ret && idx > pg_start) { nr = idx - pg_start; if (nr > delta) @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ 
-1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1718,6 +1712,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (!get_dirty_pages(inode)) goto skip_flush; @@ -1725,18 +1721,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; + } skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1754,9 +1752,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1782,7 +1780,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); 
mnt_drop_write_file(filp); return ret; @@ -2378,15 +2375,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2432,6 +2424,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, len >> F2FS_BLKSIZE_BITS, false); @@ -2442,14 +2442,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, else if (dst_osize != dst->i_size) f2fs_i_size_write(dst, dst_osize); } + if (src != dst) + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); f2fs_unlock_op(sbi); out_unlock: - if (src != dst) { - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); + if (src != dst) inode_unlock(dst); - } out: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e352fbd33848..cac317e37306 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + sbi->skipped_gc_rwsem++; continue; } @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = 
__get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3662e1f429b4..15b3b095fd58 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); mutex_lock(&fi->inmem_lock); err = __f2fs_commit_inmem_pages(inode); @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 
50495515f0a0..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,7 +215,7 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; -- 2.17.0.441.gb46fe60e1d-goog ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 10:00 ` Jaegeuk Kim @ 2018-08-04 2:31 ` Chao Yu 2018-08-12 10:24 ` Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-08-04 2:31 UTC (permalink / raw) To: Jaegeuk Kim, Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 2018/7/30 18:00, Jaegeuk Kim wrote: > On 07/30, Chao Yu wrote: >> On 2018/7/30 17:08, Jaegeuk Kim wrote: >>> On 07/30, Chao Yu wrote: >>>> On 2018/7/30 12:18, Jaegeuk Kim wrote: >>>>> On 07/30, Chao Yu wrote: >>>>>> On 2018/7/30 9:32, Jaegeuk Kim wrote: >>>>>>> The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of >>>>>>> fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. >>>>>>> >>>>>>> If it hits the miximum retrials in GC, let's give a chance to release >>>>>>> gc_mutex for a short time in order not to go into live lock in the worst >>>>>>> case. >>>>>>> >>>>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> >>>>>>> --- >>>>>>> fs/f2fs/f2fs.h | 1 + >>>>>>> fs/f2fs/file.c | 62 ++++++++++++++++++++++------------------------- >>>>>>> fs/f2fs/gc.c | 22 ++++++++++++----- >>>>>>> fs/f2fs/segment.c | 5 +++- >>>>>>> fs/f2fs/segment.h | 2 +- >>>>>>> 5 files changed, 51 insertions(+), 41 deletions(-) >>>>>>> >>>>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >>>>>>> index a9447c7d6570..50349780001b 100644 >>>>>>> --- a/fs/f2fs/f2fs.h >>>>>>> +++ b/fs/f2fs/f2fs.h >>>>>>> @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { >>>>>>> unsigned int gc_mode; /* current GC state */ >>>>>>> /* for skip statistic */ >>>>>>> unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ >>>>>>> + unsigned long long skipped_gc_rwsem; /* FG_GC only */ >>>>>>> >>>>>>> /* threshold for gc trials on pinned files */ >>>>>>> u64 gc_pin_file_threshold; >>>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >>>>>>> index 78c1bd6b8497..2b7d26ebb294 100644 >>>>>>> --- a/fs/f2fs/file.c >>>>>>> +++ b/fs/f2fs/file.c >>>>>>> @@ -1179,10 +1179,12 
@@ static int __exchange_data_block(struct inode *src_inode, >>>>>>> return ret; >>>>>>> } >>>>>>> >>>>>>> -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >>>>>>> +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) >>>>>>> { >>>>>>> struct f2fs_sb_info *sbi = F2FS_I_SB(inode); >>>>>>> pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >>>>>>> + pgoff_t start = offset >> PAGE_SHIFT; >>>>>>> + pgoff_t end = (offset + len) >> PAGE_SHIFT; >>>>>>> int ret; >>>>>>> >>>>>>> f2fs_balance_fs(sbi, true); >>>>>>> @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >>>>>>> >>>>>>> f2fs_drop_extent_tree(inode); >>>>>>> >>>>>>> + /* avoid gc operation during block exchange */ >>>>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> + truncate_pagecache(inode, offset); >>>>>>> ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); >>>>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> + >>>>>>> f2fs_unlock_op(sbi); >>>>>>> return ret; >>>>>>> } >>>>>>> >>>>>>> static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>>>>>> { >>>>>>> - pgoff_t pg_start, pg_end; >>>>>>> loff_t new_size; >>>>>>> int ret; >>>>>>> >>>>>>> @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>>>>>> if (ret) >>>>>>> return ret; >>>>>>> >>>>>>> - pg_start = offset >> PAGE_SHIFT; >>>>>>> - pg_end = (offset + len) >> PAGE_SHIFT; >>>>>>> - >>>>>>> - /* avoid gc operation during block exchange */ >>>>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> - >>>>>>> down_write(&F2FS_I(inode)->i_mmap_sem); >>>>>>> /* write out all dirty pages from offset */ >>>>>>> ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >>>>>>> if (ret) >>>>>>> goto out_unlock; >>>>>>> >>>>>>> - truncate_pagecache(inode, offset); >>>>>>> - >>>>>>> - ret = 
f2fs_do_collapse(inode, pg_start, pg_end); >>>>>>> + ret = f2fs_do_collapse(inode, offset, len); >>>>>>> if (ret) >>>>>>> goto out_unlock; >>>>>>> >>>>>>> @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >>>>>>> f2fs_i_size_write(inode, new_size); >>>>>>> out_unlock: >>>>>>> up_write(&F2FS_I(inode)->i_mmap_sem); >>>>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> return ret; >>>>>>> } >>>>>>> >>>>>>> @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>>>> >>>>>>> f2fs_balance_fs(sbi, true); >>>>>>> >>>>>>> - /* avoid gc operation during block exchange */ >>>>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> - >>>>>>> down_write(&F2FS_I(inode)->i_mmap_sem); >>>>>>> ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); >>>>>>> if (ret) >>>>>>> @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>>>> if (ret) >>>>>>> goto out; >>>>>>> >>>>>>> - truncate_pagecache(inode, offset); >>>>>>> - >>>>>>> pg_start = offset >> PAGE_SHIFT; >>>>>>> pg_end = (offset + len) >> PAGE_SHIFT; >>>>>>> delta = pg_end - pg_start; >>>>>>> idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >>>>>>> >>>>>>> + /* avoid gc operation during block exchange */ >>>>>>> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> + truncate_pagecache(inode, offset); >>>>>>> + >>>>>>> while (!ret && idx > pg_start) { >>>>>>> nr = idx - pg_start; >>>>>>> if (nr > delta) >>>>>>> @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >>>>>>> idx + delta, nr, false); >>>>>>> f2fs_unlock_op(sbi); >>>>>>> } >>>>>>> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> >>>>>>> /* write out all moved pages, if possible */ >>>>>>> filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >>>>>>> @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t 
offset, loff_t len) >>>>>>> f2fs_i_size_write(inode, new_size); >>>>>>> out: >>>>>>> up_write(&F2FS_I(inode)->i_mmap_sem); >>>>>>> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>>> return ret; >>>>>>> } >>>>>>> >>>>>>> @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >>>>>>> >>>>>>> inode_lock(inode); >>>>>>> >>>>>>> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >>>>>> >>>>>> After removing lock here, how can we handle below condition: >>>>>> >>>>>> commit 27319ba4044c0c67d62ae39e53c0118c89f0a029 >>>>>> Author: Chao Yu <yuchao0@huawei.com> >>>>>> Date: Tue Apr 17 17:51:28 2018 +0800 >>>>>> >>>>>> f2fs: fix race in between GC and atomic open >>>>>> >>>>>> Thread GC thread >>>>>> - f2fs_ioc_start_atomic_write >>>>>> - get_dirty_pages >>>>>> - filemap_write_and_wait_range >>>>>> - f2fs_gc >>>>>> - do_garbage_collect >>>>>> - gc_data_segment >>>>>> - move_data_page >>>>>> - f2fs_is_atomic_file >>>>>> - set_page_dirty >>>>>> - set_inode_flag(, FI_ATOMIC_FILE) >>>>>> >>>>>> Dirty data page can still be generated by GC in race condition as >>>>>> above call stack. >>>>>> >>>>>> This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write >>>>>> to avoid such race. >>>>> >>>>> "f2fs: don't allow any writes on aborted atomic writes" disallows any writes >>>>> on atomic file which has the revoking flag. So, this won't happen. In GC, >>>> >>>> Hmmm... In above condition, it's not related to FI_ATOMIC_REVOKE_REQUEST flag >>>> since we do not drop any inmem pages for atomic file. >>>> >>>> That patch was trying to eliminate a hole which exists in between >>>> filemap_write_and_wait_range and set_inode_flag(, FI_ATOMIC_FILE), where GC can >>>> still dirty page in the inode, it can pollute isolation of database transaction, >>>> so that is why we need this lock. >>> >>> Ah, GC can generate any dirty pages of atomic_written data before starting >>> another transaction, right? 
>> >> Yes, >> >>> >>> I think we can do >>> - set_inode_flag() first, followed by >>> - filemap_write_and_wait_range(). >> >> If there is redirty flow during filemap_write_and_wait_range, the page can be >> register as inmem one? >> >> f2fs_set_data_page_dirty() >> ... >> if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { >> if (!IS_ATOMIC_WRITTEN_PAGE(page)) { >> f2fs_register_inmem_page(inode, page); >> return 1; >> } >> >> Another concern is set_inode_flag and filemap_write_and_wait_range can be >> reorder by CPU pipeline, so the serial should be? >> >> set_inode_flag(, FI_ATOMIC_COMMIT) >> smp_mb() >> set_inode_flag(, FI_ATOMIC_FILE) >> smp_mb() >> >> ret = filemap_write_and_wait_range >> if (ret) >> goto err_out; >> >> clear_inode_flag(, FI_ATOMIC_COMMIT) >> >> Is that right? > > Getting to the wrong way. Let me try rwsem again in the narrowed scope like > this. How about keep lock order as: - inode_lock - i_mmap_sem - lock_all() - unlock_all() - i_gc_rwsem[WRITE] - lock_op() Thanks, > > From f6341121ee0c07fa834960a7c86cb0ea3f824231 Mon Sep 17 00:00:00 2001 > From: Jaegeuk Kim <jaegeuk@kernel.org> > Date: Wed, 25 Jul 2018 12:11:56 +0900 > Subject: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc > > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > If it hits the miximum retrials in GC, let's give a chance to release > gc_mutex for a short time in order not to go into live lock in the worst > case. 
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > --- > v2: > - add rwsem on start_atomic_write > > fs/f2fs/f2fs.h | 1 + > fs/f2fs/file.c | 71 ++++++++++++++++++++++++----------------------- > fs/f2fs/gc.c | 22 +++++++++++---- > fs/f2fs/segment.c | 5 +++- > fs/f2fs/segment.h | 2 +- > 5 files changed, 58 insertions(+), 43 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index a9447c7d6570..50349780001b 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > unsigned int gc_mode; /* current GC state */ > /* for skip statistic */ > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > /* threshold for gc trials on pinned files */ > u64 gc_pin_file_threshold; > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > index 78c1bd6b8497..a960869bf60f 100644 > --- a/fs/f2fs/file.c > +++ b/fs/f2fs/file.c > @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, > return ret; > } > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > { > struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > + pgoff_t start = offset >> PAGE_SHIFT; > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > int ret; > > f2fs_balance_fs(sbi, true); > @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > > f2fs_drop_extent_tree(inode); > > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + truncate_pagecache(inode, offset); > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + > f2fs_unlock_op(sbi); > return ret; > } > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > { > - 
pgoff_t pg_start, pg_end; > loff_t new_size; > int ret; > > @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > return ret; > > - pg_start = offset >> PAGE_SHIFT; > - pg_end = (offset + len) >> PAGE_SHIFT; > - > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > down_write(&F2FS_I(inode)->i_mmap_sem); > /* write out all dirty pages from offset */ > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > if (ret) > goto out_unlock; > > - truncate_pagecache(inode, offset); > - > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > + ret = f2fs_do_collapse(inode, offset, len); > if (ret) > goto out_unlock; > > @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out_unlock: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_balance_fs(sbi, true); > > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > down_write(&F2FS_I(inode)->i_mmap_sem); > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > if (ret) > @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > goto out; > > - truncate_pagecache(inode, offset); > - > pg_start = offset >> PAGE_SHIFT; > pg_end = (offset + len) >> PAGE_SHIFT; > delta = pg_end - pg_start; > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + truncate_pagecache(inode, offset); > + > while (!ret && idx > pg_start) { > nr = idx - pg_start; > if (nr > delta) > @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode 
*inode, loff_t offset, loff_t len) > idx + delta, nr, false); > f2fs_unlock_op(sbi); > } > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > /* write out all moved pages, if possible */ > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > inode_lock(inode); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > if (f2fs_is_atomic_file(inode)) { > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > ret = -EINVAL; > @@ -1718,6 +1712,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > if (ret) > goto out; > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + > if (!get_dirty_pages(inode)) > goto skip_flush; > > @@ -1725,18 +1721,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > "Unexpected flush for atomic writes: ino=%lu, npages=%u", > inode->i_ino, get_dirty_pages(inode)); > ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); > - if (ret) > + if (ret) { > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > goto out; > + } > skip_flush: > set_inode_flag(inode, FI_ATOMIC_FILE); > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > F2FS_I(inode)->inmem_task = current; > stat_inc_atomic_write(inode); > stat_update_max_atomic_write(inode); > out: > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -1754,9 +1752,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > if (ret) > return ret; > > - inode_lock(inode); > + 
f2fs_balance_fs(F2FS_I_SB(inode), true); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + inode_lock(inode); > > if (f2fs_is_volatile_file(inode)) { > ret = -EINVAL; > @@ -1782,7 +1780,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > ret = -EINVAL; > } > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -2378,15 +2375,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > } > > inode_lock(src); > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > if (src != dst) { > ret = -EBUSY; > if (!inode_trylock(dst)) > goto out; > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > - inode_unlock(dst); > - goto out; > - } > } > > ret = -EINVAL; > @@ -2432,6 +2424,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > f2fs_balance_fs(sbi, true); > f2fs_lock_op(sbi); > + > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > + if (src != dst) { > + ret = -EBUSY; > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > + goto out_src; > + } > + > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > pos_out >> F2FS_BLKSIZE_BITS, > len >> F2FS_BLKSIZE_BITS, false); > @@ -2442,14 +2442,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > else if (dst_osize != dst->i_size) > f2fs_i_size_write(dst, dst_osize); > } > + if (src != dst) > + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > +out_src: > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > f2fs_unlock_op(sbi); > out_unlock: > - if (src != dst) { > - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > + if (src != dst) > inode_unlock(dst); > - } > out: > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > inode_unlock(src); > return ret; > } > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > index e352fbd33848..cac317e37306 100644 > --- a/fs/f2fs/gc.c > +++ b/fs/f2fs/gc.c > @@ -884,6 +884,7 @@ static void 
gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > if (!down_write_trylock( > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > iput(inode); > + sbi->skipped_gc_rwsem++; > continue; > } > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > continue; > if (!down_write_trylock( > &fi->i_gc_rwsem[WRITE])) { > + sbi->skipped_gc_rwsem++; > up_write(&fi->i_gc_rwsem[READ]); > continue; > } > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > prefree_segments(sbi)); > > cpc.reason = __get_cp_reason(sbi); > + sbi->skipped_gc_rwsem = 0; > gc_more: > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > ret = -EINVAL; > @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > total_freed += seg_freed; > > if (gc_type == FG_GC) { > - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > + sbi->skipped_gc_rwsem) > skipped_round++; > last_skipped = sbi->skipped_atomic_files[FG_GC]; > round++; > @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > if (gc_type == FG_GC) > sbi->cur_victim_sec = NULL_SEGNO; > > - if (!sync) { > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > - skipped_round * 2 >= round) > - f2fs_drop_inmem_pages_all(sbi, true); > + if (sync) > + goto stop; > + > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > + if (skipped_round <= MAX_SKIP_GC_COUNT || > + skipped_round * 2 < round) { > segno = NULL_SEGNO; > goto gc_more; > } > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { > + f2fs_drop_inmem_pages_all(sbi, true); > + segno = NULL_SEGNO; > + goto gc_more; > + } > if (gc_type == FG_GC) > ret = f2fs_write_checkpoint(sbi, &cpc); > } > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > index 3662e1f429b4..15b3b095fd58 100644 > --- a/fs/f2fs/segment.c > +++ b/fs/f2fs/segment.c > @@ -444,10 +444,12 @@ int 
f2fs_commit_inmem_pages(struct inode *inode) > struct f2fs_inode_info *fi = F2FS_I(inode); > int err; > > - f2fs_balance_fs(sbi, true); > + f2fs_balance_fs(F2FS_I_SB(inode), true); > + > f2fs_lock_op(sbi); > > set_inode_flag(inode, FI_ATOMIC_COMMIT); > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > mutex_lock(&fi->inmem_lock); > err = __f2fs_commit_inmem_pages(inode); > @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) > spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); > mutex_unlock(&fi->inmem_lock); > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > clear_inode_flag(inode, FI_ATOMIC_COMMIT); > > f2fs_unlock_op(sbi); > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > index 50495515f0a0..b3d9e317ff0c 100644 > --- a/fs/f2fs/segment.h > +++ b/fs/f2fs/segment.h > @@ -215,7 +215,7 @@ struct segment_allocation { > #define IS_DUMMY_WRITTEN_PAGE(page) \ > (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > > -#define MAX_SKIP_ATOMIC_COUNT 16 > +#define MAX_SKIP_GC_COUNT 16 > > struct inmem_pages { > struct list_head list; > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-04 2:31 ` [f2fs-dev] " Chao Yu @ 2018-08-12 10:24 ` Chao Yu 2018-08-13 20:12 ` Jaegeuk Kim 0 siblings, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-08-12 10:24 UTC (permalink / raw) To: Jaegeuk Kim, Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 2018/8/4 10:31, Chao Yu wrote: > How about keep lock order as: > > - inode_lock > - i_mmap_sem > - lock_all() > - unlock_all() > - i_gc_rwsem[WRITE] > - lock_op() I got the warning below when testing the latest dev-test: - f2fs_direct_IO: i_gc_rwsem[WRITE] -> do_blockdev_direct_IO -> i_mmap_sem - current lock dependency: i_mmap_sem -> i_gc_rwsem[WRITE] So I guess we should still grab the i_gc_rwsem[WRITE] lock before i_mmap_sem. Any ideas? run fstests generic/208 at 2018-08-12 18:10:39 ====================================================== WARNING: possible circular locking dependency detected 4.18.0-rc2+ #39 Tainted: G O ------------------------------------------------------ aio-dio-invalid/20621 is trying to acquire lock: e47a5a00 (&mm->mmap_sem){++++}, at: get_user_pages_unlocked+0x38/0x1d0 but task is already holding lock: 82073b2b (&fi->i_gc_rwsem[WRITE]){++++}, at: f2fs_direct_IO+0x16c/0x590 [f2fs] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&fi->i_gc_rwsem[WRITE]){++++}: lock_acquire+0xae/0x220 down_write+0x38/0x60 f2fs_setattr+0x187/0x5b0 [f2fs] notify_change+0x22b/0x400 do_truncate+0x5c/0x90 path_openat+0xaf5/0x1370 do_filp_open+0x5c/0xb0 do_sys_open+0xf8/0x1d0 sys_open+0x22/0x30 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x53/0x86 -> #1 (&fi->i_mmap_sem){++++}: lock_acquire+0xae/0x220 down_read+0x38/0x60 f2fs_filemap_fault+0x21/0x40 [f2fs] __do_fault+0x16/0x30 handle_mm_fault+0xa37/0x10c0 __do_page_fault+0x19f/0x530 do_page_fault+0x20/0x280 common_exception+0x89/0x8e -> #0 (&mm->mmap_sem){++++}: __lock_acquire+0xe89/0x10e0 lock_acquire+0xae/0x220 down_read+0x38/0x60 get_user_pages_unlocked+0x38/0x1d0 get_user_pages_fast+0x70/0xe1 iov_iter_get_pages+0x94/0x250 do_blockdev_direct_IO+0x2191/0x25a0 __blockdev_direct_IO+0x4a/0x50 f2fs_direct_IO+0x332/0x590 [f2fs] generic_file_direct_write+0xe9/0x2c0 __generic_file_write_iter+0x9a/0x1f0 f2fs_file_write_iter+0xdd/0x3b0 [f2fs] aio_write.isra.20+0xe0/0x190 sys_io_submit+0x464/0x650 do_int80_syscall_32+0x6c/0x190 restore_all+0x0/0x6a other info that might help us debug this: Chain exists of: &mm->mmap_sem --> &fi->i_mmap_sem --> &fi->i_gc_rwsem[WRITE] Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fi->i_gc_rwsem[WRITE]); lock(&fi->i_mmap_sem); lock(&fi->i_gc_rwsem[WRITE]); lock(&mm->mmap_sem); *** DEADLOCK *** 2 locks held by aio-dio-invalid/20621: #0: ca54a0ec (&sb->s_type->i_mutex_key#17){+.+.}, at: f2fs_file_write_iter+0x6f/0x3b0 [f2fs] #1: 82073b2b (&fi->i_gc_rwsem[WRITE]){++++}, at: f2fs_direct_IO+0x16c/0x590 [f2fs] stack backtrace: CPU: 1 PID: 20621 Comm: aio-dio-invalid Tainted: G O 4.18.0-rc2+ #39 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack+0x5f/0x86 print_circular_bug.isra.35+0x1b6/0x1c0 check_prev_add.constprop.44+0x67a/0x6a0 __lock_acquire+0xe89/0x10e0 lock_acquire+0xae/0x220 ? 
get_user_pages_unlocked+0x38/0x1d0 down_read+0x38/0x60 ? get_user_pages_unlocked+0x38/0x1d0 get_user_pages_unlocked+0x38/0x1d0 ? mark_held_locks+0x5d/0x80 ? get_user_pages_fast+0xb7/0xe1 ? trace_hardirqs_on_caller+0xdd/0x1c0 get_user_pages_fast+0x70/0xe1 iov_iter_get_pages+0x94/0x250 ? lockdep_init_map+0x12/0x20 ? __raw_spin_lock_init+0x31/0x60 do_blockdev_direct_IO+0x2191/0x25a0 ? __blockdev_direct_IO+0x4a/0x50 ? __this_cpu_preempt_check+0xf/0x20 ? free_unref_page_list+0x1c7/0x2a0 ? trace_hardirqs_on_caller+0xdd/0x1c0 ? __get_data_block+0xc0/0xc0 [f2fs] __blockdev_direct_IO+0x4a/0x50 ? __get_data_block+0xc0/0xc0 [f2fs] f2fs_direct_IO+0x332/0x590 [f2fs] ? __get_data_block+0xc0/0xc0 [f2fs] generic_file_direct_write+0xe9/0x2c0 __generic_file_write_iter+0x9a/0x1f0 f2fs_file_write_iter+0xdd/0x3b0 [f2fs] aio_write.isra.20+0xe0/0x190 ? sys_io_submit+0x1ab/0x650 sys_io_submit+0x464/0x650 ? sys_io_submit+0x13a/0x650 do_int80_syscall_32+0x6c/0x190 entry_INT80_32+0x36/0x36 > > Thanks, > >> >> From f6341121ee0c07fa834960a7c86cb0ea3f824231 Mon Sep 17 00:00:00 2001 >> From: Jaegeuk Kim <jaegeuk@kernel.org> >> Date: Wed, 25 Jul 2018 12:11:56 +0900 >> Subject: [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc >> >> The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of >> fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. >> >> If it hits the miximum retrials in GC, let's give a chance to release >> gc_mutex for a short time in order not to go into live lock in the worst >> case. 
>> >> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> >> --- >> v2: >> - add rwsem on start_atomic_write >> >> fs/f2fs/f2fs.h | 1 + >> fs/f2fs/file.c | 71 ++++++++++++++++++++++++----------------------- >> fs/f2fs/gc.c | 22 +++++++++++---- >> fs/f2fs/segment.c | 5 +++- >> fs/f2fs/segment.h | 2 +- >> 5 files changed, 58 insertions(+), 43 deletions(-) >> >> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >> index a9447c7d6570..50349780001b 100644 >> --- a/fs/f2fs/f2fs.h >> +++ b/fs/f2fs/f2fs.h >> @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { >> unsigned int gc_mode; /* current GC state */ >> /* for skip statistic */ >> unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ >> + unsigned long long skipped_gc_rwsem; /* FG_GC only */ >> >> /* threshold for gc trials on pinned files */ >> u64 gc_pin_file_threshold; >> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >> index 78c1bd6b8497..a960869bf60f 100644 >> --- a/fs/f2fs/file.c >> +++ b/fs/f2fs/file.c >> @@ -1179,10 +1179,12 @@ static int __exchange_data_block(struct inode *src_inode, >> return ret; >> } >> >> -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >> +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) >> { >> struct f2fs_sb_info *sbi = F2FS_I_SB(inode); >> pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >> + pgoff_t start = offset >> PAGE_SHIFT; >> + pgoff_t end = (offset + len) >> PAGE_SHIFT; >> int ret; >> >> f2fs_balance_fs(sbi, true); >> @@ -1190,14 +1192,18 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) >> >> f2fs_drop_extent_tree(inode); >> >> + /* avoid gc operation during block exchange */ >> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> + truncate_pagecache(inode, offset); >> ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); >> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> + >> f2fs_unlock_op(sbi); >> return ret; >> } >> >> static int 
f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >> { >> - pgoff_t pg_start, pg_end; >> loff_t new_size; >> int ret; >> >> @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >> if (ret) >> return ret; >> >> - pg_start = offset >> PAGE_SHIFT; >> - pg_end = (offset + len) >> PAGE_SHIFT; >> - >> - /* avoid gc operation during block exchange */ >> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> - >> down_write(&F2FS_I(inode)->i_mmap_sem); >> /* write out all dirty pages from offset */ >> ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >> if (ret) >> goto out_unlock; >> >> - truncate_pagecache(inode, offset); >> - >> - ret = f2fs_do_collapse(inode, pg_start, pg_end); >> + ret = f2fs_do_collapse(inode, offset, len); >> if (ret) >> goto out_unlock; >> >> @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) >> f2fs_i_size_write(inode, new_size); >> out_unlock: >> up_write(&F2FS_I(inode)->i_mmap_sem); >> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> return ret; >> } >> >> @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >> >> f2fs_balance_fs(sbi, true); >> >> - /* avoid gc operation during block exchange */ >> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> - >> down_write(&F2FS_I(inode)->i_mmap_sem); >> ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); >> if (ret) >> @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >> if (ret) >> goto out; >> >> - truncate_pagecache(inode, offset); >> - >> pg_start = offset >> PAGE_SHIFT; >> pg_end = (offset + len) >> PAGE_SHIFT; >> delta = pg_end - pg_start; >> idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; >> >> + /* avoid gc operation during block exchange */ >> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> + truncate_pagecache(inode, offset); >> + >> while 
(!ret && idx > pg_start) { >> nr = idx - pg_start; >> if (nr > delta) >> @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >> idx + delta, nr, false); >> f2fs_unlock_op(sbi); >> } >> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> >> /* write out all moved pages, if possible */ >> filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); >> @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) >> f2fs_i_size_write(inode, new_size); >> out: >> up_write(&F2FS_I(inode)->i_mmap_sem); >> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> return ret; >> } >> >> @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >> >> inode_lock(inode); >> >> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> - >> if (f2fs_is_atomic_file(inode)) { >> if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) >> ret = -EINVAL; >> @@ -1718,6 +1712,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >> if (ret) >> goto out; >> >> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> + >> if (!get_dirty_pages(inode)) >> goto skip_flush; >> >> @@ -1725,18 +1721,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) >> "Unexpected flush for atomic writes: ino=%lu, npages=%u", >> inode->i_ino, get_dirty_pages(inode)); >> ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); >> - if (ret) >> + if (ret) { >> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> goto out; >> + } >> skip_flush: >> set_inode_flag(inode, FI_ATOMIC_FILE); >> clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); >> - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); >> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> >> + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); >> F2FS_I(inode)->inmem_task = current; >> stat_inc_atomic_write(inode); >> stat_update_max_atomic_write(inode); >> out: >> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> inode_unlock(inode); >> 
mnt_drop_write_file(filp); >> return ret; >> @@ -1754,9 +1752,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) >> if (ret) >> return ret; >> >> - inode_lock(inode); >> + f2fs_balance_fs(F2FS_I_SB(inode), true); >> >> - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> + inode_lock(inode); >> >> if (f2fs_is_volatile_file(inode)) { >> ret = -EINVAL; >> @@ -1782,7 +1780,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) >> clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); >> ret = -EINVAL; >> } >> - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> inode_unlock(inode); >> mnt_drop_write_file(filp); >> return ret; >> @@ -2378,15 +2375,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >> } >> >> inode_lock(src); >> - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >> if (src != dst) { >> ret = -EBUSY; >> if (!inode_trylock(dst)) >> goto out; >> - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { >> - inode_unlock(dst); >> - goto out; >> - } >> } >> >> ret = -EINVAL; >> @@ -2432,6 +2424,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >> >> f2fs_balance_fs(sbi, true); >> f2fs_lock_op(sbi); >> + >> + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >> + if (src != dst) { >> + ret = -EBUSY; >> + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) >> + goto out_src; >> + } >> + >> ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, >> pos_out >> F2FS_BLKSIZE_BITS, >> len >> F2FS_BLKSIZE_BITS, false); >> @@ -2442,14 +2442,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, >> else if (dst_osize != dst->i_size) >> f2fs_i_size_write(dst, dst_osize); >> } >> + if (src != dst) >> + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); >> +out_src: >> + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >> f2fs_unlock_op(sbi); >> out_unlock: >> - if (src != dst) { >> - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); >> + if (src != dst) >> inode_unlock(dst); >> - } >> out: >> - 
up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); >> inode_unlock(src); >> return ret; >> } >> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c >> index e352fbd33848..cac317e37306 100644 >> --- a/fs/f2fs/gc.c >> +++ b/fs/f2fs/gc.c >> @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, >> if (!down_write_trylock( >> &F2FS_I(inode)->i_gc_rwsem[WRITE])) { >> iput(inode); >> + sbi->skipped_gc_rwsem++; >> continue; >> } >> >> @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, >> continue; >> if (!down_write_trylock( >> &fi->i_gc_rwsem[WRITE])) { >> + sbi->skipped_gc_rwsem++; >> up_write(&fi->i_gc_rwsem[READ]); >> continue; >> } >> @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >> prefree_segments(sbi)); >> >> cpc.reason = __get_cp_reason(sbi); >> + sbi->skipped_gc_rwsem = 0; >> gc_more: >> if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { >> ret = -EINVAL; >> @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >> total_freed += seg_freed; >> >> if (gc_type == FG_GC) { >> - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) >> + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || >> + sbi->skipped_gc_rwsem) >> skipped_round++; >> last_skipped = sbi->skipped_atomic_files[FG_GC]; >> round++; >> @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >> if (gc_type == FG_GC) >> sbi->cur_victim_sec = NULL_SEGNO; >> >> - if (!sync) { >> - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { >> - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && >> - skipped_round * 2 >= round) >> - f2fs_drop_inmem_pages_all(sbi, true); >> + if (sync) >> + goto stop; >> + >> + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { >> + if (skipped_round <= MAX_SKIP_GC_COUNT || >> + skipped_round * 2 < round) { >> segno = NULL_SEGNO; >> goto gc_more; >> } >> >> + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { >> + f2fs_drop_inmem_pages_all(sbi, true); 
>> + segno = NULL_SEGNO; >> + goto gc_more; >> + } >> if (gc_type == FG_GC) >> ret = f2fs_write_checkpoint(sbi, &cpc); >> } >> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c >> index 3662e1f429b4..15b3b095fd58 100644 >> --- a/fs/f2fs/segment.c >> +++ b/fs/f2fs/segment.c >> @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) >> struct f2fs_inode_info *fi = F2FS_I(inode); >> int err; >> >> - f2fs_balance_fs(sbi, true); >> + f2fs_balance_fs(F2FS_I_SB(inode), true); >> + >> f2fs_lock_op(sbi); >> >> set_inode_flag(inode, FI_ATOMIC_COMMIT); >> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> >> mutex_lock(&fi->inmem_lock); >> err = __f2fs_commit_inmem_pages(inode); >> @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) >> spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); >> mutex_unlock(&fi->inmem_lock); >> >> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); >> clear_inode_flag(inode, FI_ATOMIC_COMMIT); >> >> f2fs_unlock_op(sbi); >> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h >> index 50495515f0a0..b3d9e317ff0c 100644 >> --- a/fs/f2fs/segment.h >> +++ b/fs/f2fs/segment.h >> @@ -215,7 +215,7 @@ struct segment_allocation { >> #define IS_DUMMY_WRITTEN_PAGE(page) \ >> (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) >> >> -#define MAX_SKIP_ATOMIC_COUNT 16 >> +#define MAX_SKIP_GC_COUNT 16 >> >> struct inmem_pages { >> struct list_head list; >> > > ------------------------------------------------------------------------------ > Check out the vibrant tech community on one of the world's most > engaging tech sites, Slashdot.org! http://sdm.link/slashdot > _______________________________________________ > Linux-f2fs-devel mailing list > Linux-f2fs-devel@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-12 10:24 ` Chao Yu @ 2018-08-13 20:12 ` Jaegeuk Kim 2018-08-14 6:21 ` Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-08-13 20:12 UTC (permalink / raw) To: Chao Yu; +Cc: Chao Yu, linux-kernel, linux-f2fs-devel On 08/12, Chao Yu wrote: > On 2018/8/4 10:31, Chao Yu wrote: > > How about keep lock order as: > > > > - inode_lock > > - i_mmap_sem > > - lock_all() > > - unlock_all() > > - i_gc_rwsem[WRITE] > > - lock_op() > > I got below warning when testing last dev-test: > > - f2fs_direct_IO current lock dependency > - i_gc_rwsem[WRITE] > - i_mmap_sem > - do_blockdev_direct_IO > - i_mmap_sem > - i_gc_rwsem[WRITE] > Yeah, it seems it's true. How about this? --- fs/f2fs/data.c | 4 ++-- fs/f2fs/file.c | 43 +++++++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f09231b1cc74..021923dc666b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2208,14 +2208,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 560751adba01..8b13afb23734 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -798,8 +798,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { bool to_smaller = (attr->ia_size <= i_size_read(inode)); - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, 
attr->ia_size); @@ -809,8 +809,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -963,8 +963,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -973,8 +973,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1201,6 +1201,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1208,6 +1209,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1228,17 +1230,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out_unlock; + return ret; ret = f2fs_do_collapse(inode, offset, len); if (ret) - goto out_unlock; + return ret; /* write out all 
moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); @@ -1246,10 +1248,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, new_size); ret = f2fs_truncate_blocks(inode, new_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out_unlock: - up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1315,10 +1316,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - goto out_sem; + return ret; pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1330,7 +1330,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - goto out_sem; + return ret; new_size = max_t(loff_t, new_size, offset + len); } else { @@ -1338,7 +1338,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - goto out_sem; + return ret; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1350,6 +1350,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1361,6 +1362,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1372,6 +1374,7 @@ 
static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1400,9 +1403,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, else f2fs_i_size_write(inode, new_size); } -out_sem: - up_write(&F2FS_I(inode)->i_mmap_sem); - return ret; } @@ -1433,13 +1433,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) - goto out; + return ret; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; + return ret; pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; @@ -1448,6 +1449,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1463,16 +1465,17 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out: - up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } -- 2.17.0.441.gb46fe60e1d-goog ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-13 20:12 ` Jaegeuk Kim @ 2018-08-14 6:21 ` Chao Yu 0 siblings, 0 replies; 21+ messages in thread From: Chao Yu @ 2018-08-14 6:21 UTC (permalink / raw) To: Jaegeuk Kim, Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 2018/8/14 4:12, Jaegeuk Kim wrote: > On 08/12, Chao Yu wrote: >> On 2018/8/4 10:31, Chao Yu wrote: >>> How about keep lock order as: >>> >>> - inode_lock >>> - i_mmap_sem >>> - lock_all() >>> - unlock_all() >>> - i_gc_rwsem[WRITE] >>> - lock_op() >> >> I got below warning when testing last dev-test: >> >> - f2fs_direct_IO current lock dependency >> - i_gc_rwsem[WRITE] >> - i_mmap_sem >> - do_blockdev_direct_IO >> - i_mmap_sem >> - i_gc_rwsem[WRITE] >> > > Yeah, it seems it's true. > How about this? It looks good to me, anyway, let me check this patch with fstests again. Thanks, > > --- > fs/f2fs/data.c | 4 ++-- > fs/f2fs/file.c | 43 +++++++++++++++++++++++-------------------- > 2 files changed, 25 insertions(+), 22 deletions(-) > > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c > index f09231b1cc74..021923dc666b 100644 > --- a/fs/f2fs/data.c > +++ b/fs/f2fs/data.c > @@ -2208,14 +2208,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) > loff_t i_size = i_size_read(inode); > > if (to > i_size) { > - down_write(&F2FS_I(inode)->i_mmap_sem); > down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + down_write(&F2FS_I(inode)->i_mmap_sem); > > truncate_pagecache(inode, i_size); > f2fs_truncate_blocks(inode, i_size, true); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > up_write(&F2FS_I(inode)->i_mmap_sem); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > } > } > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > index 560751adba01..8b13afb23734 100644 > --- a/fs/f2fs/file.c > +++ b/fs/f2fs/file.c > @@ -798,8 +798,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) > if (attr->ia_valid & ATTR_SIZE) { > bool to_smaller = (attr->ia_size <= 
i_size_read(inode)); > > - down_write(&F2FS_I(inode)->i_mmap_sem); > down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + down_write(&F2FS_I(inode)->i_mmap_sem); > > truncate_setsize(inode, attr->ia_size); > > @@ -809,8 +809,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) > * do not trim all blocks after i_size if target size is > * larger than i_size. > */ > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > up_write(&F2FS_I(inode)->i_mmap_sem); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > if (err) > return err; > @@ -963,8 +963,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) > blk_start = (loff_t)pg_start << PAGE_SHIFT; > blk_end = (loff_t)pg_end << PAGE_SHIFT; > > - down_write(&F2FS_I(inode)->i_mmap_sem); > down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + down_write(&F2FS_I(inode)->i_mmap_sem); > > truncate_inode_pages_range(mapping, blk_start, > blk_end - 1); > @@ -973,8 +973,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) > ret = f2fs_truncate_hole(inode, pg_start, pg_end); > f2fs_unlock_op(sbi); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > up_write(&F2FS_I(inode)->i_mmap_sem); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > } > } > > @@ -1201,6 +1201,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > > /* avoid gc operation during block exchange */ > down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + down_write(&F2FS_I(inode)->i_mmap_sem); > > f2fs_lock_op(sbi); > f2fs_drop_extent_tree(inode); > @@ -1208,6 +1209,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > f2fs_unlock_op(sbi); > > + up_write(&F2FS_I(inode)->i_mmap_sem); > up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > @@ -1228,17 +1230,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > return ret; > > - 
down_write(&F2FS_I(inode)->i_mmap_sem); > /* write out all dirty pages from offset */ > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > if (ret) > - goto out_unlock; > + return ret; > > ret = f2fs_do_collapse(inode, offset, len); > if (ret) > - goto out_unlock; > + return ret; > > /* write out all moved pages, if possible */ > + down_write(&F2FS_I(inode)->i_mmap_sem); > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > truncate_pagecache(inode, offset); > > @@ -1246,10 +1248,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > truncate_pagecache(inode, new_size); > > ret = f2fs_truncate_blocks(inode, new_size, true); > + up_write(&F2FS_I(inode)->i_mmap_sem); > if (!ret) > f2fs_i_size_write(inode, new_size); > -out_unlock: > - up_write(&F2FS_I(inode)->i_mmap_sem); > return ret; > } > > @@ -1315,10 +1316,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > if (ret) > return ret; > > - down_write(&F2FS_I(inode)->i_mmap_sem); > ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); > if (ret) > - goto out_sem; > + return ret; > > pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; > pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; > @@ -1330,7 +1330,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > ret = fill_zero(inode, pg_start, off_start, > off_end - off_start); > if (ret) > - goto out_sem; > + return ret; > > new_size = max_t(loff_t, new_size, offset + len); > } else { > @@ -1338,7 +1338,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > ret = fill_zero(inode, pg_start++, off_start, > PAGE_SIZE - off_start); > if (ret) > - goto out_sem; > + return ret; > > new_size = max_t(loff_t, new_size, > (loff_t)pg_start << PAGE_SHIFT); > @@ -1350,6 +1350,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > pgoff_t end; > > 
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + down_write(&F2FS_I(inode)->i_mmap_sem); > > truncate_pagecache_range(inode, > (loff_t)index << PAGE_SHIFT, > @@ -1361,6 +1362,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); > if (ret) { > f2fs_unlock_op(sbi); > + up_write(&F2FS_I(inode)->i_mmap_sem); > up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > goto out; > } > @@ -1372,6 +1374,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > f2fs_put_dnode(&dn); > > f2fs_unlock_op(sbi); > + up_write(&F2FS_I(inode)->i_mmap_sem); > up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > f2fs_balance_fs(sbi, dn.node_changed); > @@ -1400,9 +1403,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, > else > f2fs_i_size_write(inode, new_size); > } > -out_sem: > - up_write(&F2FS_I(inode)->i_mmap_sem); > - > return ret; > } > > @@ -1433,13 +1433,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > down_write(&F2FS_I(inode)->i_mmap_sem); > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > + up_write(&F2FS_I(inode)->i_mmap_sem); > if (ret) > - goto out; > + return ret; > > /* write out all dirty pages from offset */ > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > if (ret) > - goto out; > + return ret; > > pg_start = offset >> PAGE_SHIFT; > pg_end = (offset + len) >> PAGE_SHIFT; > @@ -1448,6 +1449,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > /* avoid gc operation during block exchange */ > down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + down_write(&F2FS_I(inode)->i_mmap_sem); > truncate_pagecache(inode, offset); > > while (!ret && idx > pg_start) { > @@ -1463,16 +1465,17 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > idx + delta, nr, false); > f2fs_unlock_op(sbi); > } > + 
up_write(&F2FS_I(inode)->i_mmap_sem); > up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > /* write out all moved pages, if possible */ > + down_write(&F2FS_I(inode)->i_mmap_sem); > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > truncate_pagecache(inode, offset); > + up_write(&F2FS_I(inode)->i_mmap_sem); > > if (!ret) > f2fs_i_size_write(inode, new_size); > -out: > - up_write(&F2FS_I(inode)->i_mmap_sem); > return ret; > } > > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v2] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-07-30 1:32 [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc Jaegeuk Kim 2018-07-30 3:29 ` Chao Yu @ 2018-08-04 22:29 ` Jaegeuk Kim 2018-08-05 14:42 ` [f2fs-dev] " Chao Yu 2018-08-05 16:34 ` [f2fs-dev] [PATCH v3] " Jaegeuk Kim 1 sibling, 2 replies; 21+ messages in thread From: Jaegeuk Kim @ 2018-08-04 22:29 UTC (permalink / raw) To: linux-kernel, linux-f2fs-devel The f2fs_gc() called by f2fs_balance_fs() must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. If it hits the maximum number of retries in GC, let's give it a chance to release gc_mutex for a short time in order not to go into a livelock in the worst case. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- v2 log from v1: - keep lock order: i_gc_rwsem -> lock_op fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 76 ++++++++++++++++++++++++----------------------- fs/f2fs/gc.c | 22 ++++++++++---- fs/f2fs/segment.c | 5 +++- fs/f2fs/segment.h | 2 +- 5 files changed, 61 insertions(+), 45 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a9447c7d6570..50349780001b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 78c1bd6b8497..7bcf47f41ef1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1179,25 +1179,31 @@ static int __exchange_data_block(struct inode *src_inode, return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) /
PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - f2fs_drop_extent_tree(inode); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) goto out_unlock; - truncate_pagecache(inode, offset); - - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) goto out_unlock; @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t 
len) if (ret) goto out; - truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, offset); + while (!ret && idx > pg_start) { nr = idx - pg_start; if (nr > delta) @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1718,6 +1712,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (!get_dirty_pages(inode)) goto skip_flush; @@ -1725,18 +1721,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; + } skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); 
F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1754,9 +1752,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1782,7 +1780,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2378,15 +2375,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2431,6 +2423,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_unlock; f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + f2fs_lock_op(sbi); ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, @@ -2443,13 +2443,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_i_size_write(dst, dst_osize); } f2fs_unlock_op(sbi); -out_unlock: - if (src != dst) { + + if (src != dst) up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) inode_unlock(dst); - } out: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c 
b/fs/f2fs/gc.c index e352fbd33848..cac317e37306 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + sbi->skipped_gc_rwsem++; continue; } @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3662e1f429b4..15b3b095fd58 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -444,10 +444,12 @@ int 
f2fs_commit_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); mutex_lock(&fi->inmem_lock); err = __f2fs_commit_inmem_pages(inode); @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 50495515f0a0..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,7 +215,7 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; -- 2.17.0.441.gb46fe60e1d-goog ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v2] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-04 22:29 ` [PATCH v2] " Jaegeuk Kim @ 2018-08-05 14:42 ` Chao Yu 2018-08-05 16:08 ` Jaegeuk Kim 2018-08-05 16:34 ` [f2fs-dev] [PATCH v3] " Jaegeuk Kim 1 sibling, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-08-05 14:42 UTC (permalink / raw) To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel On 2018/8/5 6:29, Jaegeuk Kim wrote: > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > If it hits the miximum retrials in GC, let's give a chance to release > gc_mutex for a short time in order not to go into live lock in the worst > case. > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > --- > v2 log from v1: > - keep lock order: i_gc_rwsem -> lock_op > > fs/f2fs/f2fs.h | 1 + > fs/f2fs/file.c | 76 ++++++++++++++++++++++++----------------------- > fs/f2fs/gc.c | 22 ++++++++++---- > fs/f2fs/segment.c | 5 +++- > fs/f2fs/segment.h | 2 +- > 5 files changed, 61 insertions(+), 45 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index a9447c7d6570..50349780001b 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > unsigned int gc_mode; /* current GC state */ > /* for skip statistic */ > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > /* threshold for gc trials on pinned files */ > u64 gc_pin_file_threshold; > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > index 78c1bd6b8497..7bcf47f41ef1 100644 > --- a/fs/f2fs/file.c > +++ b/fs/f2fs/file.c > @@ -1179,25 +1179,31 @@ static int __exchange_data_block(struct inode *src_inode, > return ret; > } > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > { > struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > 
pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > + pgoff_t start = offset >> PAGE_SHIFT; > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > int ret; > > f2fs_balance_fs(sbi, true); > - f2fs_lock_op(sbi); > > - f2fs_drop_extent_tree(inode); > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + f2fs_lock_op(sbi); > + f2fs_drop_extent_tree(inode); > + truncate_pagecache(inode, offset); > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > f2fs_unlock_op(sbi); > + > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > { > - pgoff_t pg_start, pg_end; > loff_t new_size; > int ret; > > @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > return ret; > > - pg_start = offset >> PAGE_SHIFT; > - pg_end = (offset + len) >> PAGE_SHIFT; > - > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > down_write(&F2FS_I(inode)->i_mmap_sem); > /* write out all dirty pages from offset */ > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > if (ret) > goto out_unlock; > > - truncate_pagecache(inode, offset); > - > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > + ret = f2fs_do_collapse(inode, offset, len); > if (ret) > goto out_unlock; > > @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out_unlock: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_balance_fs(sbi, true); > > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > 
down_write(&F2FS_I(inode)->i_mmap_sem); > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > if (ret) > @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > goto out; > > - truncate_pagecache(inode, offset); > - > pg_start = offset >> PAGE_SHIFT; > pg_end = (offset + len) >> PAGE_SHIFT; > delta = pg_end - pg_start; > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + truncate_pagecache(inode, offset); > + > while (!ret && idx > pg_start) { > nr = idx - pg_start; > if (nr > delta) > @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > idx + delta, nr, false); > f2fs_unlock_op(sbi); > } > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > /* write out all moved pages, if possible */ > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > inode_lock(inode); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > if (f2fs_is_atomic_file(inode)) { > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > ret = -EINVAL; > @@ -1718,6 +1712,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > if (ret) > goto out; > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + > if (!get_dirty_pages(inode)) > goto skip_flush; > > @@ -1725,18 +1721,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > "Unexpected flush for atomic writes: ino=%lu, npages=%u", > inode->i_ino, get_dirty_pages(inode)); > ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); > - if 
(ret) > + if (ret) { > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > goto out; > + } > skip_flush: > set_inode_flag(inode, FI_ATOMIC_FILE); > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > F2FS_I(inode)->inmem_task = current; > stat_inc_atomic_write(inode); > stat_update_max_atomic_write(inode); > out: > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -1754,9 +1752,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > if (ret) > return ret; > > - inode_lock(inode); > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + inode_lock(inode); > > if (f2fs_is_volatile_file(inode)) { > ret = -EINVAL; > @@ -1782,7 +1780,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > ret = -EINVAL; > } > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -2378,15 +2375,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > } > > inode_lock(src); > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > if (src != dst) { > ret = -EBUSY; > if (!inode_trylock(dst)) > goto out; > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > - inode_unlock(dst); > - goto out; > - } > } > > ret = -EINVAL; > @@ -2431,6 +2423,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > goto out_unlock; > > f2fs_balance_fs(sbi, true); > + > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > + if (src != dst) { > + ret = -EBUSY; > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > + goto out_src; > + } > + > f2fs_lock_op(sbi); > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > pos_out >> F2FS_BLKSIZE_BITS, > @@ -2443,13 
+2443,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > f2fs_i_size_write(dst, dst_osize); > } > f2fs_unlock_op(sbi); > -out_unlock: > - if (src != dst) { > + > + if (src != dst) > up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > +out_src: > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > +out_unlock: > + if (src != dst) > inode_unlock(dst); > - } > out: > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > inode_unlock(src); > return ret; > } > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > index e352fbd33848..cac317e37306 100644 > --- a/fs/f2fs/gc.c > +++ b/fs/f2fs/gc.c > @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > if (!down_write_trylock( > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > iput(inode); > + sbi->skipped_gc_rwsem++; > continue; > } > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > continue; > if (!down_write_trylock( > &fi->i_gc_rwsem[WRITE])) { > + sbi->skipped_gc_rwsem++; > up_write(&fi->i_gc_rwsem[READ]); > continue; > } > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > prefree_segments(sbi)); > > cpc.reason = __get_cp_reason(sbi); > + sbi->skipped_gc_rwsem = 0; > gc_more: > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > ret = -EINVAL; > @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > total_freed += seg_freed; > > if (gc_type == FG_GC) { > - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > + sbi->skipped_gc_rwsem) There may be other foreground operations, such as DIO, that can race with GC. In that case, if we increase sbi->skipped_gc_rwsem and later call f2fs_drop_inmem_pages_all() to drop the inmem pages, that doesn't help, does it? 
> skipped_round++; > last_skipped = sbi->skipped_atomic_files[FG_GC]; > round++; > @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > if (gc_type == FG_GC) > sbi->cur_victim_sec = NULL_SEGNO; > > - if (!sync) { > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > - skipped_round * 2 >= round) > - f2fs_drop_inmem_pages_all(sbi, true); > + if (sync) > + goto stop; > + > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > + if (skipped_round <= MAX_SKIP_GC_COUNT || > + skipped_round * 2 < round) { > segno = NULL_SEGNO; > goto gc_more; > } > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { > + f2fs_drop_inmem_pages_all(sbi, true); > + segno = NULL_SEGNO; > + goto gc_more; > + } > if (gc_type == FG_GC) > ret = f2fs_write_checkpoint(sbi, &cpc); > } > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > index 3662e1f429b4..15b3b095fd58 100644 > --- a/fs/f2fs/segment.c > +++ b/fs/f2fs/segment.c > @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) > struct f2fs_inode_info *fi = F2FS_I(inode); > int err; > > - f2fs_balance_fs(sbi, true); > + f2fs_balance_fs(F2FS_I_SB(inode), true); We don't need to change sbi to F2FS_I_SB(inode). 
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); Thanks, > + > f2fs_lock_op(sbi); > > set_inode_flag(inode, FI_ATOMIC_COMMIT); > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > mutex_lock(&fi->inmem_lock); > err = __f2fs_commit_inmem_pages(inode); > @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) > spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); > mutex_unlock(&fi->inmem_lock); > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > clear_inode_flag(inode, FI_ATOMIC_COMMIT); > > f2fs_unlock_op(sbi); > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > index 50495515f0a0..b3d9e317ff0c 100644 > --- a/fs/f2fs/segment.h > +++ b/fs/f2fs/segment.h > @@ -215,7 +215,7 @@ struct segment_allocation { > #define IS_DUMMY_WRITTEN_PAGE(page) \ > (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > > -#define MAX_SKIP_ATOMIC_COUNT 16 > +#define MAX_SKIP_GC_COUNT 16 > > struct inmem_pages { > struct list_head list; > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v2] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-05 14:42 ` [f2fs-dev] " Chao Yu @ 2018-08-05 16:08 ` Jaegeuk Kim 0 siblings, 0 replies; 21+ messages in thread From: Jaegeuk Kim @ 2018-08-05 16:08 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 08/05, Chao Yu wrote: > On 2018/8/5 6:29, Jaegeuk Kim wrote: > > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > > > If it hits the miximum retrials in GC, let's give a chance to release > > gc_mutex for a short time in order not to go into live lock in the worst > > case. > > > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > > --- > > v2 log from v1: > > - keep lock order: i_gc_rwsem -> lock_op > > > > fs/f2fs/f2fs.h | 1 + > > fs/f2fs/file.c | 76 ++++++++++++++++++++++++----------------------- > > fs/f2fs/gc.c | 22 ++++++++++---- > > fs/f2fs/segment.c | 5 +++- > > fs/f2fs/segment.h | 2 +- > > 5 files changed, 61 insertions(+), 45 deletions(-) > > > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > > index a9447c7d6570..50349780001b 100644 > > --- a/fs/f2fs/f2fs.h > > +++ b/fs/f2fs/f2fs.h > > @@ -1223,6 +1223,7 @@ struct f2fs_sb_info { > > unsigned int gc_mode; /* current GC state */ > > /* for skip statistic */ > > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > > > /* threshold for gc trials on pinned files */ > > u64 gc_pin_file_threshold; > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > > index 78c1bd6b8497..7bcf47f41ef1 100644 > > --- a/fs/f2fs/file.c > > +++ b/fs/f2fs/file.c > > @@ -1179,25 +1179,31 @@ static int __exchange_data_block(struct inode *src_inode, > > return ret; > > } > > > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > > +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > > { > > struct 
f2fs_sb_info *sbi = F2FS_I_SB(inode); > > pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > + pgoff_t start = offset >> PAGE_SHIFT; > > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > > int ret; > > > > f2fs_balance_fs(sbi, true); > > - f2fs_lock_op(sbi); > > > > - f2fs_drop_extent_tree(inode); > > + /* avoid gc operation during block exchange */ > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > + f2fs_lock_op(sbi); > > + f2fs_drop_extent_tree(inode); > > + truncate_pagecache(inode, offset); > > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > > f2fs_unlock_op(sbi); > > + > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > { > > - pgoff_t pg_start, pg_end; > > loff_t new_size; > > int ret; > > > > @@ -1212,21 +1218,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > if (ret) > > return ret; > > > > - pg_start = offset >> PAGE_SHIFT; > > - pg_end = (offset + len) >> PAGE_SHIFT; > > - > > - /* avoid gc operation during block exchange */ > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > down_write(&F2FS_I(inode)->i_mmap_sem); > > /* write out all dirty pages from offset */ > > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > > if (ret) > > goto out_unlock; > > > > - truncate_pagecache(inode, offset); > > - > > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > > + ret = f2fs_do_collapse(inode, offset, len); > > if (ret) > > goto out_unlock; > > > > @@ -1242,7 +1240,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_i_size_write(inode, new_size); > > out_unlock: > > up_write(&F2FS_I(inode)->i_mmap_sem); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > @@ -1417,9 +1414,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) 
> > > > f2fs_balance_fs(sbi, true); > > > > - /* avoid gc operation during block exchange */ > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > down_write(&F2FS_I(inode)->i_mmap_sem); > > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > > if (ret) > > @@ -1430,13 +1424,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > if (ret) > > goto out; > > > > - truncate_pagecache(inode, offset); > > - > > pg_start = offset >> PAGE_SHIFT; > > pg_end = (offset + len) >> PAGE_SHIFT; > > delta = pg_end - pg_start; > > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > > > + /* avoid gc operation during block exchange */ > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + truncate_pagecache(inode, offset); > > + > > while (!ret && idx > pg_start) { > > nr = idx - pg_start; > > if (nr > delta) > > @@ -1450,6 +1446,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > idx + delta, nr, false); > > f2fs_unlock_op(sbi); > > } > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > /* write out all moved pages, if possible */ > > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > > @@ -1459,7 +1456,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_i_size_write(inode, new_size); > > out: > > up_write(&F2FS_I(inode)->i_mmap_sem); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > @@ -1706,8 +1702,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > > > inode_lock(inode); > > > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > if (f2fs_is_atomic_file(inode)) { > > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > > ret = -EINVAL; > > @@ -1718,6 +1712,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > if (ret) > > goto out; > > > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + > > if (!get_dirty_pages(inode)) > > goto skip_flush; > > 
> > @@ -1725,18 +1721,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > "Unexpected flush for atomic writes: ino=%lu, npages=%u", > > inode->i_ino, get_dirty_pages(inode)); > > ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); > > - if (ret) > > + if (ret) { > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > goto out; > > + } > > skip_flush: > > set_inode_flag(inode, FI_ATOMIC_FILE); > > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > > - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > > F2FS_I(inode)->inmem_task = current; > > stat_inc_atomic_write(inode); > > stat_update_max_atomic_write(inode); > > out: > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > inode_unlock(inode); > > mnt_drop_write_file(filp); > > return ret; > > @@ -1754,9 +1752,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > > if (ret) > > return ret; > > > > - inode_lock(inode); > > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + inode_lock(inode); > > > > if (f2fs_is_volatile_file(inode)) { > > ret = -EINVAL; > > @@ -1782,7 +1780,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > > ret = -EINVAL; > > } > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > inode_unlock(inode); > > mnt_drop_write_file(filp); > > return ret; > > @@ -2378,15 +2375,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > } > > > > inode_lock(src); > > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > if (src != dst) { > > ret = -EBUSY; > > if (!inode_trylock(dst)) > > goto out; > > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > > - inode_unlock(dst); > > - goto out; > > - } > > } > > > > ret = -EINVAL; > > @@ -2431,6 +2423,14 @@ static int f2fs_move_file_range(struct file *file_in, 
loff_t pos_in, > > goto out_unlock; > > > > f2fs_balance_fs(sbi, true); > > + > > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > + if (src != dst) { > > + ret = -EBUSY; > > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > > + goto out_src; > > + } > > + > > f2fs_lock_op(sbi); > > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > > pos_out >> F2FS_BLKSIZE_BITS, > > @@ -2443,13 +2443,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > f2fs_i_size_write(dst, dst_osize); > > } > > f2fs_unlock_op(sbi); > > -out_unlock: > > - if (src != dst) { > > + > > + if (src != dst) > > up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > > +out_src: > > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > +out_unlock: > > + if (src != dst) > > inode_unlock(dst); > > - } > > out: > > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > inode_unlock(src); > > return ret; > > } > > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > > index e352fbd33848..cac317e37306 100644 > > --- a/fs/f2fs/gc.c > > +++ b/fs/f2fs/gc.c > > @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > > if (!down_write_trylock( > > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > > iput(inode); > > + sbi->skipped_gc_rwsem++; > > continue; > > } > > > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > > continue; > > if (!down_write_trylock( > > &fi->i_gc_rwsem[WRITE])) { > > + sbi->skipped_gc_rwsem++; > > up_write(&fi->i_gc_rwsem[READ]); > > continue; > > } > > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > prefree_segments(sbi)); > > > > cpc.reason = __get_cp_reason(sbi); > > + sbi->skipped_gc_rwsem = 0; > > gc_more: > > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > > ret = -EINVAL; > > @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > total_freed += seg_freed; > > > > if (gc_type == FG_GC) { > > - if 
(sbi->skipped_atomic_files[FG_GC] > last_skipped) > > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > > + sbi->skipped_gc_rwsem) > > There should be other foreground ops like DIO can race with GC, in such case, if > we increase sbi->skipped_gc_rwsem, and call f2fs_drop_inmem_pages_all() to drop > inmem pages later, it doesn't help? Let me add a condition to decide that. > > > skipped_round++; > > last_skipped = sbi->skipped_atomic_files[FG_GC]; > > round++; > > @@ -1112,15 +1116,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > if (gc_type == FG_GC) > > sbi->cur_victim_sec = NULL_SEGNO; > > > > - if (!sync) { > > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > > - skipped_round * 2 >= round) > > - f2fs_drop_inmem_pages_all(sbi, true); > > + if (sync) > > + goto stop; > > + > > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > > + if (skipped_round <= MAX_SKIP_GC_COUNT || > > + skipped_round * 2 < round) { > > segno = NULL_SEGNO; > > goto gc_more; > > } > > > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped) { > > + f2fs_drop_inmem_pages_all(sbi, true); > > + segno = NULL_SEGNO; > > + goto gc_more; > > + } > > if (gc_type == FG_GC) > > ret = f2fs_write_checkpoint(sbi, &cpc); > > } > > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > > index 3662e1f429b4..15b3b095fd58 100644 > > --- a/fs/f2fs/segment.c > > +++ b/fs/f2fs/segment.c > > @@ -444,10 +444,12 @@ int f2fs_commit_inmem_pages(struct inode *inode) > > struct f2fs_inode_info *fi = F2FS_I(inode); > > int err; > > > > - f2fs_balance_fs(sbi, true); > > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > We don't need to change sbi to F2FS_I_SB(inode). > > down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); Done. 
> > Thanks, > > > + > > f2fs_lock_op(sbi); > > > > set_inode_flag(inode, FI_ATOMIC_COMMIT); > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > mutex_lock(&fi->inmem_lock); > > err = __f2fs_commit_inmem_pages(inode); > > @@ -458,6 +460,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) > > spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); > > mutex_unlock(&fi->inmem_lock); > > > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > clear_inode_flag(inode, FI_ATOMIC_COMMIT); > > > > f2fs_unlock_op(sbi); > > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > > index 50495515f0a0..b3d9e317ff0c 100644 > > --- a/fs/f2fs/segment.h > > +++ b/fs/f2fs/segment.h > > @@ -215,7 +215,7 @@ struct segment_allocation { > > #define IS_DUMMY_WRITTEN_PAGE(page) \ > > (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > > > > -#define MAX_SKIP_ATOMIC_COUNT 16 > > +#define MAX_SKIP_GC_COUNT 16 > > > > struct inmem_pages { > > struct list_head list; > > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v3] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-04 22:29 ` [PATCH v2] " Jaegeuk Kim 2018-08-05 14:42 ` [f2fs-dev] " Chao Yu @ 2018-08-05 16:34 ` Jaegeuk Kim 2018-08-09 1:44 ` Chao Yu 1 sibling, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-08-05 16:34 UTC (permalink / raw) To: linux-kernel, linux-f2fs-devel The f2fs_gc() called by f2fs_balance_fs() must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. If it hits the maximum number of retries in GC, let's give a chance to release gc_mutex for a short time in order not to go into live lock in the worst case. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- v3 - don't revoke atomic_writes all the time due to gc_rwsem - keep lock order in f2fs_commit_inmem_pages fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 76 ++++++++++++++++++++++++----------------------- fs/f2fs/gc.c | 24 +++++++++++---- fs/f2fs/segment.c | 6 +++- fs/f2fs/segment.h | 2 +- 5 files changed, 64 insertions(+), 45 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1647a13be7f9..0c65c3146ead 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1234,6 +1234,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 67c9c2d4e2d9..4e2e3938f474 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1180,25 +1180,31 @@ static int __exchange_data_block(struct inode *src_inode, return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> 
PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - f2fs_drop_extent_tree(inode); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1213,21 +1219,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) goto out_unlock; - truncate_pagecache(inode, offset); - - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) goto out_unlock; @@ -1243,7 +1241,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1418,9 +1415,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1431,13 +1425,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; - 
truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, offset); + while (!ret && idx > pg_start) { nr = idx - pg_start; if (nr > delta) @@ -1451,6 +1447,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1460,7 +1457,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1707,8 +1703,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1719,6 +1713,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (!get_dirty_pages(inode)) goto skip_flush; @@ -1726,18 +1722,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; + } skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = 
current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1755,9 +1753,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1783,7 +1781,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2379,15 +2376,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2432,6 +2424,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_unlock; f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + f2fs_lock_op(sbi); ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, @@ -2444,13 +2444,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_i_size_write(dst, dst_osize); } f2fs_unlock_op(sbi); -out_unlock: - if (src != dst) { + + if (src != dst) up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) inode_unlock(dst); - } out: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 
e352fbd33848..d816c328f02b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + sbi->skipped_gc_rwsem++; continue; } @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1112,15 +1116,23 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (sbi->skipped_atomic_files[FG_GC] == last_skipped && + sbi->skipped_atomic_files[FG_GC] > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3662e1f429b4..f4ac006dac43 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c 
@@ -445,8 +445,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) int err; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + down_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); @@ -461,6 +463,8 @@ int f2fs_commit_inmem_pages(struct inode *inode) clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); + up_write(&fi->i_gc_rwsem[WRITE]); + return err; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 50495515f0a0..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,7 +215,7 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; -- 2.17.0.441.gb46fe60e1d-goog ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v3] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-05 16:34 ` [f2fs-dev] [PATCH v3] " Jaegeuk Kim @ 2018-08-09 1:44 ` Chao Yu 2018-08-09 19:59 ` Jaegeuk Kim 0 siblings, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-08-09 1:44 UTC (permalink / raw) To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel On 2018/8/6 0:34, Jaegeuk Kim wrote: > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > If it hits the miximum retrials in GC, let's give a chance to release > gc_mutex for a short time in order not to go into live lock in the worst > case. > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > --- > v3 > - don't revoke atomic_writes all the time due to gc_rwsem > - keep lock order in f2fs_commit_inmem_pages > > fs/f2fs/f2fs.h | 1 + > fs/f2fs/file.c | 76 ++++++++++++++++++++++++----------------------- > fs/f2fs/gc.c | 24 +++++++++++---- > fs/f2fs/segment.c | 6 +++- > fs/f2fs/segment.h | 2 +- > 5 files changed, 64 insertions(+), 45 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index 1647a13be7f9..0c65c3146ead 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -1234,6 +1234,7 @@ struct f2fs_sb_info { > unsigned int gc_mode; /* current GC state */ > /* for skip statistic */ > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > /* threshold for gc trials on pinned files */ > u64 gc_pin_file_threshold; > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > index 67c9c2d4e2d9..4e2e3938f474 100644 > --- a/fs/f2fs/file.c > +++ b/fs/f2fs/file.c > @@ -1180,25 +1180,31 @@ static int __exchange_data_block(struct inode *src_inode, > return ret; > } > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) > { > struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); > pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > + pgoff_t start = offset >> PAGE_SHIFT; > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > int ret; > > f2fs_balance_fs(sbi, true); > - f2fs_lock_op(sbi); > > - f2fs_drop_extent_tree(inode); > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + f2fs_lock_op(sbi); > + f2fs_drop_extent_tree(inode); > + truncate_pagecache(inode, offset); > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > f2fs_unlock_op(sbi); > + > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > { > - pgoff_t pg_start, pg_end; > loff_t new_size; > int ret; > > @@ -1213,21 +1219,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > return ret; > > - pg_start = offset >> PAGE_SHIFT; > - pg_end = (offset + len) >> PAGE_SHIFT; > - > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > down_write(&F2FS_I(inode)->i_mmap_sem); > /* write out all dirty pages from offset */ > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > if (ret) > goto out_unlock; > > - truncate_pagecache(inode, offset); > - > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > + ret = f2fs_do_collapse(inode, offset, len); > if (ret) > goto out_unlock; > > @@ -1243,7 +1241,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out_unlock: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1418,9 +1415,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_balance_fs(sbi, true); > > - /* avoid gc operation during block exchange */ > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - 
> down_write(&F2FS_I(inode)->i_mmap_sem); > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > if (ret) > @@ -1431,13 +1425,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > if (ret) > goto out; > > - truncate_pagecache(inode, offset); > - > pg_start = offset >> PAGE_SHIFT; > pg_end = (offset + len) >> PAGE_SHIFT; > delta = pg_end - pg_start; > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > + /* avoid gc operation during block exchange */ > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + truncate_pagecache(inode, offset); > + > while (!ret && idx > pg_start) { > nr = idx - pg_start; > if (nr > delta) > @@ -1451,6 +1447,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > idx + delta, nr, false); > f2fs_unlock_op(sbi); > } > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > /* write out all moved pages, if possible */ > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > @@ -1460,7 +1457,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > f2fs_i_size_write(inode, new_size); > out: > up_write(&F2FS_I(inode)->i_mmap_sem); > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > return ret; > } > > @@ -1707,8 +1703,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > inode_lock(inode); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > - > if (f2fs_is_atomic_file(inode)) { > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > ret = -EINVAL; > @@ -1719,6 +1713,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > if (ret) > goto out; > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + > if (!get_dirty_pages(inode)) > goto skip_flush; > > @@ -1726,18 +1722,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > "Unexpected flush for atomic writes: ino=%lu, npages=%u", > inode->i_ino, get_dirty_pages(inode)); > ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); > - if 
(ret) > + if (ret) { > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > goto out; > + } > skip_flush: > set_inode_flag(inode, FI_ATOMIC_FILE); > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > F2FS_I(inode)->inmem_task = current; > stat_inc_atomic_write(inode); > stat_update_max_atomic_write(inode); > out: > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -1755,9 +1753,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > if (ret) > return ret; > > - inode_lock(inode); > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > + inode_lock(inode); > > if (f2fs_is_volatile_file(inode)) { > ret = -EINVAL; > @@ -1783,7 +1781,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > ret = -EINVAL; > } > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > inode_unlock(inode); > mnt_drop_write_file(filp); > return ret; > @@ -2379,15 +2376,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > } > > inode_lock(src); > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > if (src != dst) { > ret = -EBUSY; > if (!inode_trylock(dst)) > goto out; > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > - inode_unlock(dst); > - goto out; > - } > } > > ret = -EINVAL; > @@ -2432,6 +2424,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > goto out_unlock; > > f2fs_balance_fs(sbi, true); > + > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > + if (src != dst) { > + ret = -EBUSY; > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > + goto out_src; > + } > + > f2fs_lock_op(sbi); > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > pos_out >> F2FS_BLKSIZE_BITS, > @@ -2444,13 
+2444,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > f2fs_i_size_write(dst, dst_osize); > } > f2fs_unlock_op(sbi); > -out_unlock: > - if (src != dst) { > + > + if (src != dst) > up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > +out_src: > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > +out_unlock: > + if (src != dst) > inode_unlock(dst); > - } > out: > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > inode_unlock(src); > return ret; > } > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > index e352fbd33848..d816c328f02b 100644 > --- a/fs/f2fs/gc.c > +++ b/fs/f2fs/gc.c > @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > if (!down_write_trylock( > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > iput(inode); > + sbi->skipped_gc_rwsem++; > continue; > } > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > continue; > if (!down_write_trylock( > &fi->i_gc_rwsem[WRITE])) { > + sbi->skipped_gc_rwsem++; > up_write(&fi->i_gc_rwsem[READ]); > continue; > } > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > prefree_segments(sbi)); > > cpc.reason = __get_cp_reason(sbi); > + sbi->skipped_gc_rwsem = 0; > gc_more: > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > ret = -EINVAL; > @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > total_freed += seg_freed; > > if (gc_type == FG_GC) { > - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > + sbi->skipped_gc_rwsem) > skipped_round++; > last_skipped = sbi->skipped_atomic_files[FG_GC]; > round++; > @@ -1112,15 +1116,23 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > if (gc_type == FG_GC) > sbi->cur_victim_sec = NULL_SEGNO; > > - if (!sync) { > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > - skipped_round * 2 >= round) > - f2fs_drop_inmem_pages_all(sbi, 
true); > + if (sync) > + goto stop; > + > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > + if (skipped_round <= MAX_SKIP_GC_COUNT || > + skipped_round * 2 < round) { > segno = NULL_SEGNO; > goto gc_more; > } > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped && For fggc, sbi->skipped_atomic_files[FG_GC] should always be last_skipped, since we assigned above. > + sbi->skipped_atomic_files[FG_GC] > > + sbi->skipped_gc_rwsem) { > + f2fs_drop_inmem_pages_all(sbi, true); > + segno = NULL_SEGNO; > + goto gc_more; I guess what you mean is: atomic_skipped = sbi->skipped_atomic_files[FG_GC] - last_skipped; if (atomic_skipped && atomic_skipped > sbi->skipped_gc_rwsem) f2fs_drop_inmem_pages_all(); Is that right? diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 870b337e9f7f..40f103508432 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1142,7 +1142,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned int skipped_round = 0, round = 0; + unsigned int skipped_round = 0, round = 0, atomic_skipped; trace_f2fs_gc_begin(sbi->sb, sync, background, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1196,8 +1196,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) + atomic_skipped = + sbi->skipped_atomic_files[FG_GC] - last_skipped; + if (atomic_skipped || sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1216,9 +1217,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, goto gc_more; } - if (sbi->skipped_atomic_files[FG_GC] == last_skipped && - sbi->skipped_atomic_files[FG_GC] > - sbi->skipped_gc_rwsem) { + if (atomic_skipped && atomic_skipped > sbi->skipped_gc_rwsem) { f2fs_drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; goto gc_more; Thanks, > + } > if (gc_type == 
FG_GC) > ret = f2fs_write_checkpoint(sbi, &cpc); > } > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c > index 3662e1f429b4..f4ac006dac43 100644 > --- a/fs/f2fs/segment.c > +++ b/fs/f2fs/segment.c > @@ -445,8 +445,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) > int err; > > f2fs_balance_fs(sbi, true); > - f2fs_lock_op(sbi); > > + down_write(&fi->i_gc_rwsem[WRITE]); > + > + f2fs_lock_op(sbi); > set_inode_flag(inode, FI_ATOMIC_COMMIT); > > mutex_lock(&fi->inmem_lock); > @@ -461,6 +463,8 @@ int f2fs_commit_inmem_pages(struct inode *inode) > clear_inode_flag(inode, FI_ATOMIC_COMMIT); > > f2fs_unlock_op(sbi); > + up_write(&fi->i_gc_rwsem[WRITE]); > + > return err; > } > > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h > index 50495515f0a0..b3d9e317ff0c 100644 > --- a/fs/f2fs/segment.h > +++ b/fs/f2fs/segment.h > @@ -215,7 +215,7 @@ struct segment_allocation { > #define IS_DUMMY_WRITTEN_PAGE(page) \ > (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) > > -#define MAX_SKIP_ATOMIC_COUNT 16 > +#define MAX_SKIP_GC_COUNT 16 > > struct inmem_pages { > struct list_head list; > ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v3] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-09 1:44 ` Chao Yu @ 2018-08-09 19:59 ` Jaegeuk Kim 2018-08-10 1:36 ` Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-08-09 19:59 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 08/09, Chao Yu wrote: > On 2018/8/6 0:34, Jaegeuk Kim wrote: > > The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of > > fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. > > > > If it hits the miximum retrials in GC, let's give a chance to release > > gc_mutex for a short time in order not to go into live lock in the worst > > case. > > > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > > --- > > v3 > > - don't revoke atomic_writes all the time due to gc_rwsem > > - keep lock order in f2fs_commit_inmem_pages > > > > fs/f2fs/f2fs.h | 1 + > > fs/f2fs/file.c | 76 ++++++++++++++++++++++++----------------------- > > fs/f2fs/gc.c | 24 +++++++++++---- > > fs/f2fs/segment.c | 6 +++- > > fs/f2fs/segment.h | 2 +- > > 5 files changed, 64 insertions(+), 45 deletions(-) > > > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > > index 1647a13be7f9..0c65c3146ead 100644 > > --- a/fs/f2fs/f2fs.h > > +++ b/fs/f2fs/f2fs.h > > @@ -1234,6 +1234,7 @@ struct f2fs_sb_info { > > unsigned int gc_mode; /* current GC state */ > > /* for skip statistic */ > > unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ > > + unsigned long long skipped_gc_rwsem; /* FG_GC only */ > > > > /* threshold for gc trials on pinned files */ > > u64 gc_pin_file_threshold; > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > > index 67c9c2d4e2d9..4e2e3938f474 100644 > > --- a/fs/f2fs/file.c > > +++ b/fs/f2fs/file.c > > @@ -1180,25 +1180,31 @@ static int __exchange_data_block(struct inode *src_inode, > > return ret; > > } > > > > -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) > > +static int f2fs_do_collapse(struct 
inode *inode, loff_t offset, loff_t len) > > { > > struct f2fs_sb_info *sbi = F2FS_I_SB(inode); > > pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > + pgoff_t start = offset >> PAGE_SHIFT; > > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > > int ret; > > > > f2fs_balance_fs(sbi, true); > > - f2fs_lock_op(sbi); > > > > - f2fs_drop_extent_tree(inode); > > + /* avoid gc operation during block exchange */ > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > + f2fs_lock_op(sbi); > > + f2fs_drop_extent_tree(inode); > > + truncate_pagecache(inode, offset); > > ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); > > f2fs_unlock_op(sbi); > > + > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > { > > - pgoff_t pg_start, pg_end; > > loff_t new_size; > > int ret; > > > > @@ -1213,21 +1219,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > if (ret) > > return ret; > > > > - pg_start = offset >> PAGE_SHIFT; > > - pg_end = (offset + len) >> PAGE_SHIFT; > > - > > - /* avoid gc operation during block exchange */ > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > down_write(&F2FS_I(inode)->i_mmap_sem); > > /* write out all dirty pages from offset */ > > ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > > if (ret) > > goto out_unlock; > > > > - truncate_pagecache(inode, offset); > > - > > - ret = f2fs_do_collapse(inode, pg_start, pg_end); > > + ret = f2fs_do_collapse(inode, offset, len); > > if (ret) > > goto out_unlock; > > > > @@ -1243,7 +1241,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_i_size_write(inode, new_size); > > out_unlock: > > up_write(&F2FS_I(inode)->i_mmap_sem); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > @@ -1418,9 +1415,6 @@ static int 
f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > > > f2fs_balance_fs(sbi, true); > > > > - /* avoid gc operation during block exchange */ > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > down_write(&F2FS_I(inode)->i_mmap_sem); > > ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); > > if (ret) > > @@ -1431,13 +1425,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > if (ret) > > goto out; > > > > - truncate_pagecache(inode, offset); > > - > > pg_start = offset >> PAGE_SHIFT; > > pg_end = (offset + len) >> PAGE_SHIFT; > > delta = pg_end - pg_start; > > idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; > > > > + /* avoid gc operation during block exchange */ > > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + truncate_pagecache(inode, offset); > > + > > while (!ret && idx > pg_start) { > > nr = idx - pg_start; > > if (nr > delta) > > @@ -1451,6 +1447,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > idx + delta, nr, false); > > f2fs_unlock_op(sbi); > > } > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > /* write out all moved pages, if possible */ > > filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); > > @@ -1460,7 +1457,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) > > f2fs_i_size_write(inode, new_size); > > out: > > up_write(&F2FS_I(inode)->i_mmap_sem); > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > return ret; > > } > > > > @@ -1707,8 +1703,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > > > inode_lock(inode); > > > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > - > > if (f2fs_is_atomic_file(inode)) { > > if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) > > ret = -EINVAL; > > @@ -1719,6 +1713,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > if (ret) > > goto out; > > > > + 
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + > > if (!get_dirty_pages(inode)) > > goto skip_flush; > > > > @@ -1726,18 +1722,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) > > "Unexpected flush for atomic writes: ino=%lu, npages=%u", > > inode->i_ino, get_dirty_pages(inode)); > > ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); > > - if (ret) > > + if (ret) { > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > goto out; > > + } > > skip_flush: > > set_inode_flag(inode, FI_ATOMIC_FILE); > > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > > - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > > + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > > > + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); > > F2FS_I(inode)->inmem_task = current; > > stat_inc_atomic_write(inode); > > stat_update_max_atomic_write(inode); > > out: > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > inode_unlock(inode); > > mnt_drop_write_file(filp); > > return ret; > > @@ -1755,9 +1753,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > > if (ret) > > return ret; > > > > - inode_lock(inode); > > + f2fs_balance_fs(F2FS_I_SB(inode), true); > > > > - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > + inode_lock(inode); > > > > if (f2fs_is_volatile_file(inode)) { > > ret = -EINVAL; > > @@ -1783,7 +1781,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) > > clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); > > ret = -EINVAL; > > } > > - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); > > inode_unlock(inode); > > mnt_drop_write_file(filp); > > return ret; > > @@ -2379,15 +2376,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > } > > > > inode_lock(src); > > - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > if (src != dst) { > > ret = -EBUSY; > > if (!inode_trylock(dst)) > > goto out; > > - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { > > - inode_unlock(dst); > > - goto out; > > - } 
> > } > > > > ret = -EINVAL; > > @@ -2432,6 +2424,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > goto out_unlock; > > > > f2fs_balance_fs(sbi, true); > > + > > + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > + if (src != dst) { > > + ret = -EBUSY; > > + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) > > + goto out_src; > > + } > > + > > f2fs_lock_op(sbi); > > ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, > > pos_out >> F2FS_BLKSIZE_BITS, > > @@ -2444,13 +2444,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, > > f2fs_i_size_write(dst, dst_osize); > > } > > f2fs_unlock_op(sbi); > > -out_unlock: > > - if (src != dst) { > > + > > + if (src != dst) > > up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); > > +out_src: > > + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > +out_unlock: > > + if (src != dst) > > inode_unlock(dst); > > - } > > out: > > - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); > > inode_unlock(src); > > return ret; > > } > > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > > index e352fbd33848..d816c328f02b 100644 > > --- a/fs/f2fs/gc.c > > +++ b/fs/f2fs/gc.c > > @@ -884,6 +884,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > > if (!down_write_trylock( > > &F2FS_I(inode)->i_gc_rwsem[WRITE])) { > > iput(inode); > > + sbi->skipped_gc_rwsem++; > > continue; > > } > > > > @@ -913,6 +914,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, > > continue; > > if (!down_write_trylock( > > &fi->i_gc_rwsem[WRITE])) { > > + sbi->skipped_gc_rwsem++; > > up_write(&fi->i_gc_rwsem[READ]); > > continue; > > } > > @@ -1062,6 +1064,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > prefree_segments(sbi)); > > > > cpc.reason = __get_cp_reason(sbi); > > + sbi->skipped_gc_rwsem = 0; > > gc_more: > > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > > ret = -EINVAL; > > @@ -1103,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info 
*sbi, bool sync, > > total_freed += seg_freed; > > > > if (gc_type == FG_GC) { > > - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) > > + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || > > + sbi->skipped_gc_rwsem) > > skipped_round++; > > last_skipped = sbi->skipped_atomic_files[FG_GC]; > > round++; > > @@ -1112,15 +1116,23 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > if (gc_type == FG_GC) > > sbi->cur_victim_sec = NULL_SEGNO; > > > > - if (!sync) { > > - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > > - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && > > - skipped_round * 2 >= round) > > - f2fs_drop_inmem_pages_all(sbi, true); > > + if (sync) > > + goto stop; > > + > > + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { > > + if (skipped_round <= MAX_SKIP_GC_COUNT || > > + skipped_round * 2 < round) { > > segno = NULL_SEGNO; > > goto gc_more; > > } > > > > + if (sbi->skipped_atomic_files[FG_GC] == last_skipped && > > For fggc, sbi->skipped_atomic_files[FG_GC] should always be last_skipped, since > we assigned above. > > > + sbi->skipped_atomic_files[FG_GC] > > > + sbi->skipped_gc_rwsem) { > > + f2fs_drop_inmem_pages_all(sbi, true); > > + segno = NULL_SEGNO; > > + goto gc_more; > > I guess what you mean is: > > atomic_skipped = sbi->skipped_atomic_files[FG_GC] - last_skipped; > > if (atomic_skipped && atomic_skipped > sbi->skipped_gc_rwsem) > f2fs_drop_inmem_pages_all(); > > Is that right? Yup, how about this? 
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d816c328f02b..cb510fb36523 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1052,6 +1052,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1064,8 +1065,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); - sbi->skipped_gc_rwsem = 0; gc_more: + sbi->skipped_gc_rwsem = 0; + first_skipped = last_skipped; + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; @@ -1126,8 +1129,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, goto gc_more; } - if (sbi->skipped_atomic_files[FG_GC] == last_skipped && - sbi->skipped_atomic_files[FG_GC] > + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > sbi->skipped_gc_rwsem) { f2fs_drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v3] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-09 19:59 ` Jaegeuk Kim @ 2018-08-10 1:36 ` Chao Yu 2018-08-10 1:46 ` Jaegeuk Kim 0 siblings, 1 reply; 21+ messages in thread From: Chao Yu @ 2018-08-10 1:36 UTC (permalink / raw) To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel On 2018/8/10 3:59, Jaegeuk Kim wrote: > Yup, how about this? > > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > index d816c328f02b..cb510fb36523 100644 > --- a/fs/f2fs/gc.c > +++ b/fs/f2fs/gc.c > @@ -1052,6 +1052,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), > }; > unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; > + unsigned long long first_skipped; > unsigned int skipped_round = 0, round = 0; > > trace_f2fs_gc_begin(sbi->sb, sync, background, > @@ -1064,8 +1065,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > prefree_segments(sbi)); > > cpc.reason = __get_cp_reason(sbi); > - sbi->skipped_gc_rwsem = 0; > gc_more: > + sbi->skipped_gc_rwsem = 0; > + first_skipped = last_skipped; > + > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > ret = -EINVAL; > goto stop; > @@ -1126,8 +1129,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > goto gc_more; > } > > - if (sbi->skipped_atomic_files[FG_GC] == last_skipped && > - sbi->skipped_atomic_files[FG_GC] > > + if (first_skipped < last_skipped && > + (last_skipped - first_skipped) > IMO, it would be better to judge the condition using the skipped count accumulated over all rounds of FGGC instead of only the last round, since the count from the last round alone may not be very accurate. Thoughts? Thanks, > sbi->skipped_gc_rwsem) { > f2fs_drop_inmem_pages_all(sbi, true); > segno = NULL_SEGNO; > > . > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v3] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-10 1:36 ` Chao Yu @ 2018-08-10 1:46 ` Jaegeuk Kim 2018-08-20 7:13 ` Chao Yu 0 siblings, 1 reply; 21+ messages in thread From: Jaegeuk Kim @ 2018-08-10 1:46 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On 08/10, Chao Yu wrote: > On 2018/8/10 3:59, Jaegeuk Kim wrote: > > Yup, how about this? > > > > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c > > index d816c328f02b..cb510fb36523 100644 > > --- a/fs/f2fs/gc.c > > +++ b/fs/f2fs/gc.c > > @@ -1052,6 +1052,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), > > }; > > unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; > > + unsigned long long first_skipped; > > unsigned int skipped_round = 0, round = 0; > > > > trace_f2fs_gc_begin(sbi->sb, sync, background, > > @@ -1064,8 +1065,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > prefree_segments(sbi)); > > > > cpc.reason = __get_cp_reason(sbi); > > - sbi->skipped_gc_rwsem = 0; > > gc_more: > > + sbi->skipped_gc_rwsem = 0; > > + first_skipped = last_skipped; > > + > > if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { > > ret = -EINVAL; > > goto stop; > > @@ -1126,8 +1129,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, > > goto gc_more; > > } > > > > - if (sbi->skipped_atomic_files[FG_GC] == last_skipped && > > - sbi->skipped_atomic_files[FG_GC] > > > + if (first_skipped < last_skipped && > > + (last_skipped - first_skipped) > > > IMO, it would be better to judge the condition with skipped number in all round > of FGGC instead of last round, since number in last round may not very accurate. Yup, moved before gc_more. > > Thoughts? > > Thanks, > > > sbi->skipped_gc_rwsem) { > > f2fs_drop_inmem_pages_all(sbi, true); > > segno = NULL_SEGNO; > > > > . > > ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [f2fs-dev] [PATCH v3] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc 2018-08-10 1:46 ` Jaegeuk Kim @ 2018-08-20 7:13 ` Chao Yu 0 siblings, 0 replies; 21+ messages in thread From: Chao Yu @ 2018-08-20 7:13 UTC (permalink / raw) To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel Hi Jaegeuk, Please add: Reviewed-by: Chao Yu <yuchao0@huawei.com> Thanks, On 2018/8/10 9:46, Jaegeuk Kim wrote: > On 08/10, Chao Yu wrote: >> On 2018/8/10 3:59, Jaegeuk Kim wrote: >>> Yup, how about this? >>> >>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c >>> index d816c328f02b..cb510fb36523 100644 >>> --- a/fs/f2fs/gc.c >>> +++ b/fs/f2fs/gc.c >>> @@ -1052,6 +1052,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>> .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), >>> }; >>> unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; >>> + unsigned long long first_skipped; >>> unsigned int skipped_round = 0, round = 0; >>> >>> trace_f2fs_gc_begin(sbi->sb, sync, background, >>> @@ -1064,8 +1065,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>> prefree_segments(sbi)); >>> >>> cpc.reason = __get_cp_reason(sbi); >>> - sbi->skipped_gc_rwsem = 0; >>> gc_more: >>> + sbi->skipped_gc_rwsem = 0; >>> + first_skipped = last_skipped; >>> + >>> if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { >>> ret = -EINVAL; >>> goto stop; >>> @@ -1126,8 +1129,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, >>> goto gc_more; >>> } >>> >>> - if (sbi->skipped_atomic_files[FG_GC] == last_skipped && >>> - sbi->skipped_atomic_files[FG_GC] > >>> + if (first_skipped < last_skipped && >>> + (last_skipped - first_skipped) > >> >> IMO, it would be better to judge the condition with skipped number in all round >> of FGGC instead of last round, since number in last round may not very accurate. > > Yup, moved before gc_more. > >> >> Thoughts? >> >> Thanks, >> >>> sbi->skipped_gc_rwsem) { >>> f2fs_drop_inmem_pages_all(sbi, true); >>> segno = NULL_SEGNO; >>> >>> . >>> > > . 
> ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc @ 2018-07-25 3:22 Jaegeuk Kim 0 siblings, 0 replies; 21+ messages in thread From: Jaegeuk Kim @ 2018-07-25 3:22 UTC (permalink / raw) To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim f2fs_gc(), which is called by f2fs_balance_fs(), must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/file.c | 2 ++ fs/f2fs/segment.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84293423f830..3a5c35fa0603 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1754,6 +1754,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; + f2fs_balance_fs(F2FS_I_SB(inode), true); + inode_lock(inode); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d28fa03a115f..17354089b4ab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -445,7 +445,6 @@ int f2fs_commit_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); -- 2.17.0.441.gb46fe60e1d-goog ^ permalink raw reply related [flat|nested] 21+ messages in thread
end of thread, other threads:[~2018-08-20 7:13 UTC | newest] Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2018-07-30 1:32 [PATCH] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc Jaegeuk Kim 2018-07-30 3:29 ` Chao Yu 2018-07-30 4:18 ` Jaegeuk Kim 2018-07-30 6:36 ` Chao Yu 2018-07-30 9:08 ` Jaegeuk Kim 2018-07-30 9:28 ` Chao Yu 2018-07-30 10:00 ` Jaegeuk Kim 2018-08-04 2:31 ` [f2fs-dev] " Chao Yu 2018-08-12 10:24 ` Chao Yu 2018-08-13 20:12 ` Jaegeuk Kim 2018-08-14 6:21 ` Chao Yu 2018-08-04 22:29 ` [PATCH v2] " Jaegeuk Kim 2018-08-05 14:42 ` [f2fs-dev] " Chao Yu 2018-08-05 16:08 ` Jaegeuk Kim 2018-08-05 16:34 ` [f2fs-dev] [PATCH v3] " Jaegeuk Kim 2018-08-09 1:44 ` Chao Yu 2018-08-09 19:59 ` Jaegeuk Kim 2018-08-10 1:36 ` Chao Yu 2018-08-10 1:46 ` Jaegeuk Kim 2018-08-20 7:13 ` Chao Yu -- strict thread matches above, loose matches on Subject: below -- 2018-07-25 3:22 [PATCH] " Jaegeuk Kim
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).