All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-28  1:51 ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-28  1:51 UTC (permalink / raw)
  To: linux-f2fs-devel, Jaegeuk Kim, Chao Yu; +Cc: linux-fsdevel, stable

From: Eric Biggers <ebiggers@google.com>

Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
they require preallocating blocks, but f2fs doesn't support unwritten
blocks and therefore has to preallocate the blocks as regular blocks.
f2fs has no way to reliably roll back such preallocations, so as a
result, f2fs will leak uninitialized blocks to users if a DIO write
doesn't fully complete.  This can be easily reproduced by issuing a DIO
write that will fail due to misalignment, e.g.:

	rm -f file
	truncate -s 1000000 file
	dd if=/dev/zero bs=999999 oflag=direct conv=notrunc of=file
	od -tx1 file  # shows uninitialized disk blocks

Until a proper design for non-overwrite DIO writes on f2fs can be
designed and implemented, remove support for them and make them fall
back to buffered I/O.  This is what other filesystems that don't support
unwritten blocks, e.g. ext2, also do, at least for non-extending DIO
writes.  However, f2fs can't do extending DIO writes either, as f2fs
appears to have no mechanism for guaranteeing that leftover allocated
blocks past EOF will get truncated.  (f2fs does have an orphan list, but
it's only used for deleting inodes, not truncating them.)

This patch doesn't attempt to remove the F2FS_GET_BLOCK_{DIO,PRE_DIO}
cases in f2fs_map_blocks(); that can be cleaned up later.

Fixes: bfad7c2d4033 ("f2fs: introduce a new direct_IO write path")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---

This applies to the latest f2fs.git#dev branch.

 fs/f2fs/data.c | 82 +++++++++++---------------------------------------
 fs/f2fs/f2fs.h | 24 +++++----------
 fs/f2fs/file.c | 11 ++-----
 3 files changed, 28 insertions(+), 89 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index cb71d7317ad2..756b2277bf1b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1374,9 +1374,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct f2fs_map_blocks map;
-	int flag;
 	int err = 0;
-	bool direct_io = iocb->ki_flags & IOCB_DIRECT;
 
 	map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
 	map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
@@ -1390,13 +1388,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 	map.m_seg_type = NO_CHECK_TYPE;
 	map.m_may_create = true;
 
-	if (direct_io) {
-		map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
-		flag = f2fs_force_buffered_io(inode, iocb, from) ?
-					F2FS_GET_BLOCK_PRE_AIO :
-					F2FS_GET_BLOCK_PRE_DIO;
-		goto map_blocks;
-	}
 	if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
 		err = f2fs_convert_inline_inode(inode);
 		if (err)
@@ -1405,13 +1396,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 	if (f2fs_has_inline_data(inode))
 		return err;
 
-	flag = F2FS_GET_BLOCK_PRE_AIO;
-
-map_blocks:
-	err = f2fs_map_blocks(inode, &map, 1, flag);
+	err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
 	if (map.m_len > 0 && err == -ENOSPC) {
-		if (!direct_io)
-			set_inode_flag(inode, FI_NO_PREALLOC);
+		set_inode_flag(inode, FI_NO_PREALLOC);
 		err = 0;
 	}
 	return err;
@@ -1701,47 +1688,30 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks)
 	return (blks << inode->i_blkbits);
 }
 
-static int __get_data_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh, int create, int flag,
-			pgoff_t *next_pgofs, int seg_type, bool may_write)
+static int get_data_block_dio(struct inode *inode, sector_t iblock,
+			      struct buffer_head *bh, int create)
 {
-	struct f2fs_map_blocks map;
+	struct f2fs_map_blocks map = {
+		.m_lblk = iblock,
+		.m_len = bytes_to_blks(inode, bh->b_size),
+	};
 	int err;
 
-	map.m_lblk = iblock;
-	map.m_len = bytes_to_blks(inode, bh->b_size);
-	map.m_next_pgofs = next_pgofs;
-	map.m_next_extent = NULL;
-	map.m_seg_type = seg_type;
-	map.m_may_create = may_write;
-
-	err = f2fs_map_blocks(inode, &map, create, flag);
+	/*
+	 * We don't allocate blocks here, as that would expose uninitialized
+	 * blocks if the DIO write doesn't fully complete.  DIO writes that need
+	 * to allocate blocks will fall back to buffered writes.
+	 */
+	err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
 	if (!err) {
-		map_bh(bh, inode->i_sb, map.m_pblk);
+		if (map.m_flags & F2FS_MAP_MAPPED)
+			map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
 		bh->b_size = blks_to_bytes(inode, map.m_len);
 	}
 	return err;
 }
 
-static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	return __get_data_block(inode, iblock, bh_result, create,
-				F2FS_GET_BLOCK_DIO, NULL,
-				f2fs_rw_hint_to_seg_type(inode->i_write_hint),
-				true);
-}
-
-static int get_data_block_dio(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	return __get_data_block(inode, iblock, bh_result, create,
-				F2FS_GET_BLOCK_DIO, NULL,
-				f2fs_rw_hint_to_seg_type(inode->i_write_hint),
-				false);
-}
-
 static int f2fs_xattr_fiemap(struct inode *inode,
 				struct fiemap_extent_info *fieinfo)
 {
@@ -3556,7 +3526,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	int err;
 	enum rw_hint hint = iocb->ki_hint;
 	int whint_mode = F2FS_OPTION(sbi).whint_mode;
-	bool do_opu;
 
 	err = check_direct_IO(inode, iter, offset);
 	if (err)
@@ -3565,8 +3534,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (f2fs_force_buffered_io(inode, iocb, iter))
 		return 0;
 
-	do_opu = rw == WRITE && f2fs_lfs_mode(sbi);
-
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
 	if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
@@ -3578,27 +3545,15 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			err = -EAGAIN;
 			goto out;
 		}
-		if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
-			up_read(&fi->i_gc_rwsem[rw]);
-			iocb->ki_hint = hint;
-			err = -EAGAIN;
-			goto out;
-		}
 	} else {
 		down_read(&fi->i_gc_rwsem[rw]);
-		if (do_opu)
-			down_read(&fi->i_gc_rwsem[READ]);
 	}
 
 	err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-			iter, rw == WRITE ? get_data_block_dio_write :
-			get_data_block_dio, NULL, f2fs_dio_submit_bio,
+			iter, get_data_block_dio, NULL, f2fs_dio_submit_bio,
 			rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
 			DIO_SKIP_HOLES);
 
-	if (do_opu)
-		up_read(&fi->i_gc_rwsem[READ]);
-
 	up_read(&fi->i_gc_rwsem[rw]);
 
 	if (rw == WRITE) {
@@ -3607,8 +3562,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		if (err > 0) {
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
 									err);
-			if (!do_opu)
-				set_inode_flag(inode, FI_UPDATE_WRITE);
+			set_inode_flag(inode, FI_UPDATE_WRITE);
 		} else if (err == -EIOCBQUEUED) {
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
 						count - iov_iter_count(iter));
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5d16486feb8f..c99756a6ff5f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4302,17 +4302,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
 	f2fs_mark_inode_dirty_sync(inode, true);
 }
 
-static inline int block_unaligned_IO(struct inode *inode,
-				struct kiocb *iocb, struct iov_iter *iter)
-{
-	unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
-	unsigned int blocksize_mask = (1 << i_blkbits) - 1;
-	loff_t offset = iocb->ki_pos;
-	unsigned long align = offset | iov_iter_alignment(iter);
-
-	return align & blocksize_mask;
-}
-
 static inline bool f2fs_force_buffered_io(struct inode *inode,
 				struct kiocb *iocb, struct iov_iter *iter)
 {
@@ -4329,12 +4318,13 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
 	 */
 	if (f2fs_sb_has_blkzoned(sbi))
 		return true;
-	if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
-		if (block_unaligned_IO(inode, iocb, iter))
-			return true;
-		if (F2FS_IO_ALIGNED(sbi))
-			return true;
-	}
+	/*
+	 * DIO writes in LFS mode would require preallocating blocks, which is
+	 * hard to do without exposing uninitialized blocks after short writes.
+	 */
+	if (f2fs_lfs_mode(sbi) && rw == WRITE)
+		return true;
+
 	if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
 		return true;
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f335d38efc76..745672ab5a2a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4287,14 +4287,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			err = f2fs_convert_inline_inode(inode);
 			if (err)
 				goto out_err;
-			/*
-			 * If force_buffere_io() is true, we have to allocate
-			 * blocks all the time, since f2fs_direct_IO will fall
-			 * back to buffered IO.
-			 */
-			if (!f2fs_force_buffered_io(inode, iocb, from) &&
-					f2fs_lfs_mode(F2FS_I_SB(inode)))
-				goto write;
+
+			set_inode_flag(inode, FI_NO_PREALLOC);
+			goto write;
 		}
 		preallocated = true;
 		target_size = iocb->ki_pos + iov_iter_count(from);

base-commit: 08b8de81abe13b595120973b3089c4958e3ff2da
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-28  1:51 ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-28  1:51 UTC (permalink / raw)
  To: linux-f2fs-devel, Jaegeuk Kim, Chao Yu; +Cc: linux-fsdevel, stable

From: Eric Biggers <ebiggers@google.com>

Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
they require preallocating blocks, but f2fs doesn't support unwritten
blocks and therefore has to preallocate the blocks as regular blocks.
f2fs has no way to reliably roll back such preallocations, so as a
result, f2fs will leak uninitialized blocks to users if a DIO write
doesn't fully complete.  This can be easily reproduced by issuing a DIO
write that will fail due to misalignment, e.g.:

	rm -f file
	truncate -s 1000000 file
	dd if=/dev/zero bs=999999 oflag=direct conv=notrunc of=file
	od -tx1 file  # shows uninitialized disk blocks

Until a proper design for non-overwrite DIO writes on f2fs can be
designed and implemented, remove support for them and make them fall
back to buffered I/O.  This is what other filesystems that don't support
unwritten blocks, e.g. ext2, also do, at least for non-extending DIO
writes.  However, f2fs can't do extending DIO writes either, as f2fs
appears to have no mechanism for guaranteeing that leftover allocated
blocks past EOF will get truncated.  (f2fs does have an orphan list, but
it's only used for deleting inodes, not truncating them.)

This patch doesn't attempt to remove the F2FS_GET_BLOCK_{DIO,PRE_DIO}
cases in f2fs_map_blocks(); that can be cleaned up later.

Fixes: bfad7c2d4033 ("f2fs: introduce a new direct_IO write path")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---

This applies to the latest f2fs.git#dev branch.

 fs/f2fs/data.c | 82 +++++++++++---------------------------------------
 fs/f2fs/f2fs.h | 24 +++++----------
 fs/f2fs/file.c | 11 ++-----
 3 files changed, 28 insertions(+), 89 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index cb71d7317ad2..756b2277bf1b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1374,9 +1374,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct f2fs_map_blocks map;
-	int flag;
 	int err = 0;
-	bool direct_io = iocb->ki_flags & IOCB_DIRECT;
 
 	map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
 	map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
@@ -1390,13 +1388,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 	map.m_seg_type = NO_CHECK_TYPE;
 	map.m_may_create = true;
 
-	if (direct_io) {
-		map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
-		flag = f2fs_force_buffered_io(inode, iocb, from) ?
-					F2FS_GET_BLOCK_PRE_AIO :
-					F2FS_GET_BLOCK_PRE_DIO;
-		goto map_blocks;
-	}
 	if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
 		err = f2fs_convert_inline_inode(inode);
 		if (err)
@@ -1405,13 +1396,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 	if (f2fs_has_inline_data(inode))
 		return err;
 
-	flag = F2FS_GET_BLOCK_PRE_AIO;
-
-map_blocks:
-	err = f2fs_map_blocks(inode, &map, 1, flag);
+	err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
 	if (map.m_len > 0 && err == -ENOSPC) {
-		if (!direct_io)
-			set_inode_flag(inode, FI_NO_PREALLOC);
+		set_inode_flag(inode, FI_NO_PREALLOC);
 		err = 0;
 	}
 	return err;
@@ -1701,47 +1688,30 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks)
 	return (blks << inode->i_blkbits);
 }
 
-static int __get_data_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh, int create, int flag,
-			pgoff_t *next_pgofs, int seg_type, bool may_write)
+static int get_data_block_dio(struct inode *inode, sector_t iblock,
+			      struct buffer_head *bh, int create)
 {
-	struct f2fs_map_blocks map;
+	struct f2fs_map_blocks map = {
+		.m_lblk = iblock,
+		.m_len = bytes_to_blks(inode, bh->b_size),
+	};
 	int err;
 
-	map.m_lblk = iblock;
-	map.m_len = bytes_to_blks(inode, bh->b_size);
-	map.m_next_pgofs = next_pgofs;
-	map.m_next_extent = NULL;
-	map.m_seg_type = seg_type;
-	map.m_may_create = may_write;
-
-	err = f2fs_map_blocks(inode, &map, create, flag);
+	/*
+	 * We don't allocate blocks here, as that would expose uninitialized
+	 * blocks if the DIO write doesn't fully complete.  DIO writes that need
+	 * to allocate blocks will fall back to buffered writes.
+	 */
+	err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
 	if (!err) {
-		map_bh(bh, inode->i_sb, map.m_pblk);
+		if (map.m_flags & F2FS_MAP_MAPPED)
+			map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
 		bh->b_size = blks_to_bytes(inode, map.m_len);
 	}
 	return err;
 }
 
-static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	return __get_data_block(inode, iblock, bh_result, create,
-				F2FS_GET_BLOCK_DIO, NULL,
-				f2fs_rw_hint_to_seg_type(inode->i_write_hint),
-				true);
-}
-
-static int get_data_block_dio(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	return __get_data_block(inode, iblock, bh_result, create,
-				F2FS_GET_BLOCK_DIO, NULL,
-				f2fs_rw_hint_to_seg_type(inode->i_write_hint),
-				false);
-}
-
 static int f2fs_xattr_fiemap(struct inode *inode,
 				struct fiemap_extent_info *fieinfo)
 {
@@ -3556,7 +3526,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	int err;
 	enum rw_hint hint = iocb->ki_hint;
 	int whint_mode = F2FS_OPTION(sbi).whint_mode;
-	bool do_opu;
 
 	err = check_direct_IO(inode, iter, offset);
 	if (err)
@@ -3565,8 +3534,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (f2fs_force_buffered_io(inode, iocb, iter))
 		return 0;
 
-	do_opu = rw == WRITE && f2fs_lfs_mode(sbi);
-
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
 	if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
@@ -3578,27 +3545,15 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			err = -EAGAIN;
 			goto out;
 		}
-		if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
-			up_read(&fi->i_gc_rwsem[rw]);
-			iocb->ki_hint = hint;
-			err = -EAGAIN;
-			goto out;
-		}
 	} else {
 		down_read(&fi->i_gc_rwsem[rw]);
-		if (do_opu)
-			down_read(&fi->i_gc_rwsem[READ]);
 	}
 
 	err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-			iter, rw == WRITE ? get_data_block_dio_write :
-			get_data_block_dio, NULL, f2fs_dio_submit_bio,
+			iter, get_data_block_dio, NULL, f2fs_dio_submit_bio,
 			rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
 			DIO_SKIP_HOLES);
 
-	if (do_opu)
-		up_read(&fi->i_gc_rwsem[READ]);
-
 	up_read(&fi->i_gc_rwsem[rw]);
 
 	if (rw == WRITE) {
@@ -3607,8 +3562,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		if (err > 0) {
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
 									err);
-			if (!do_opu)
-				set_inode_flag(inode, FI_UPDATE_WRITE);
+			set_inode_flag(inode, FI_UPDATE_WRITE);
 		} else if (err == -EIOCBQUEUED) {
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
 						count - iov_iter_count(iter));
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5d16486feb8f..c99756a6ff5f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4302,17 +4302,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
 	f2fs_mark_inode_dirty_sync(inode, true);
 }
 
-static inline int block_unaligned_IO(struct inode *inode,
-				struct kiocb *iocb, struct iov_iter *iter)
-{
-	unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
-	unsigned int blocksize_mask = (1 << i_blkbits) - 1;
-	loff_t offset = iocb->ki_pos;
-	unsigned long align = offset | iov_iter_alignment(iter);
-
-	return align & blocksize_mask;
-}
-
 static inline bool f2fs_force_buffered_io(struct inode *inode,
 				struct kiocb *iocb, struct iov_iter *iter)
 {
@@ -4329,12 +4318,13 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
 	 */
 	if (f2fs_sb_has_blkzoned(sbi))
 		return true;
-	if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
-		if (block_unaligned_IO(inode, iocb, iter))
-			return true;
-		if (F2FS_IO_ALIGNED(sbi))
-			return true;
-	}
+	/*
+	 * DIO writes in LFS mode would require preallocating blocks, which is
+	 * hard to do without exposing uninitialized blocks after short writes.
+	 */
+	if (f2fs_lfs_mode(sbi) && rw == WRITE)
+		return true;
+
 	if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
 		return true;
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f335d38efc76..745672ab5a2a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4287,14 +4287,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			err = f2fs_convert_inline_inode(inode);
 			if (err)
 				goto out_err;
-			/*
-			 * If force_buffere_io() is true, we have to allocate
-			 * blocks all the time, since f2fs_direct_IO will fall
-			 * back to buffered IO.
-			 */
-			if (!f2fs_force_buffered_io(inode, iocb, from) &&
-					f2fs_lfs_mode(F2FS_I_SB(inode)))
-				goto write;
+
+			set_inode_flag(inode, FI_NO_PREALLOC);
+			goto write;
 		}
 		preallocated = true;
 		target_size = iocb->ki_pos + iov_iter_count(from);

base-commit: 08b8de81abe13b595120973b3089c4958e3ff2da
-- 
2.32.0



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-28  1:51 ` [f2fs-dev] " Eric Biggers
@ 2021-07-30 19:17   ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-30 19:17 UTC (permalink / raw)
  To: linux-f2fs-devel, Jaegeuk Kim, Chao Yu; +Cc: linux-fsdevel, stable

On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> they require preallocating blocks, but f2fs doesn't support unwritten
> blocks and therefore has to preallocate the blocks as regular blocks.
> f2fs has no way to reliably roll back such preallocations, so as a
> result, f2fs will leak uninitialized blocks to users if a DIO write
> doesn't fully complete.  This can be easily reproduced by issuing a DIO
> write that will fail due to misalignment, e.g.:
> 
> 	rm -f file
> 	truncate -s 1000000 file
> 	dd if=/dev/zero bs=999999 oflag=direct conv=notrunc of=file
> 	od -tx1 file  # shows uninitialized disk blocks
> 
> Until a proper design for non-overwrite DIO writes on f2fs can be
> designed and implemented, remove support for them and make them fall
> back to buffered I/O.  This is what other filesystems that don't support
> unwritten blocks, e.g. ext2, also do, at least for non-extending DIO
> writes.  However, f2fs can't do extending DIO writes either, as f2fs
> appears to have no mechanism for guaranteeing that leftover allocated
> blocks past EOF will get truncated.  (f2fs does have an orphan list, but
> it's only used for deleting inodes, not truncating them.)
> 
> This patch doesn't attempt to remove the F2FS_GET_BLOCK_{DIO,PRE_DIO}
> cases in f2fs_map_blocks(); that can be cleaned up later.
> 
> Fixes: bfad7c2d4033 ("f2fs: introduce a new direct_IO write path")
> Cc: stable@vger.kernel.org
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---

Any opinion on this patch?  This really needs to be fixed one way or another.
Probably before the conversion to iomap, as this fix will need to be backported.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-30 19:17   ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-30 19:17 UTC (permalink / raw)
  To: linux-f2fs-devel, Jaegeuk Kim, Chao Yu; +Cc: linux-fsdevel, stable

On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> they require preallocating blocks, but f2fs doesn't support unwritten
> blocks and therefore has to preallocate the blocks as regular blocks.
> f2fs has no way to reliably roll back such preallocations, so as a
> result, f2fs will leak uninitialized blocks to users if a DIO write
> doesn't fully complete.  This can be easily reproduced by issuing a DIO
> write that will fail due to misalignment, e.g.:
> 
> 	rm -f file
> 	truncate -s 1000000 file
> 	dd if=/dev/zero bs=999999 oflag=direct conv=notrunc of=file
> 	od -tx1 file  # shows uninitialized disk blocks
> 
> Until a proper design for non-overwrite DIO writes on f2fs can be
> designed and implemented, remove support for them and make them fall
> back to buffered I/O.  This is what other filesystems that don't support
> unwritten blocks, e.g. ext2, also do, at least for non-extending DIO
> writes.  However, f2fs can't do extending DIO writes either, as f2fs
> appears to have no mechanism for guaranteeing that leftover allocated
> blocks past EOF will get truncated.  (f2fs does have an orphan list, but
> it's only used for deleting inodes, not truncating them.)
> 
> This patch doesn't attempt to remove the F2FS_GET_BLOCK_{DIO,PRE_DIO}
> cases in f2fs_map_blocks(); that can be cleaned up later.
> 
> Fixes: bfad7c2d4033 ("f2fs: introduce a new direct_IO write path")
> Cc: stable@vger.kernel.org
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---

Any opinion on this patch?  This really needs to be fixed one way or another.
Probably before the conversion to iomap, as this fix will need to be backported.

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-30 19:17   ` [f2fs-dev] " Eric Biggers
@ 2021-07-30 22:12     ` Jaegeuk Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-07-30 22:12 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-f2fs-devel, Chao Yu, linux-fsdevel, stable

On 07/30, Eric Biggers wrote:
> On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > From: Eric Biggers <ebiggers@google.com>
> > 
> > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > they require preallocating blocks, but f2fs doesn't support unwritten
> > blocks and therefore has to preallocate the blocks as regular blocks.
> > f2fs has no way to reliably roll back such preallocations, so as a

Hmm, I'm still wondering why this becomes a problem. And, do we really need
to roll back the preallocated blocks?

> > result, f2fs will leak uninitialized blocks to users if a DIO write
> > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > write that will fail due to misalignment, e.g.:

If there's any error, truncating blocks having NEW_ADDR could address this?

> > 
> > 	rm -f file
> > 	truncate -s 1000000 file
> > 	dd if=/dev/zero bs=999999 oflag=direct conv=notrunc of=file
> > 	od -tx1 file  # shows uninitialized disk blocks
> > 
> > Until a proper design for non-overwrite DIO writes on f2fs can be
> > designed and implemented, remove support for them and make them fall
> > back to buffered I/O.  This is what other filesystems that don't support
> > unwritten blocks, e.g. ext2, also do, at least for non-extending DIO
> > writes.  However, f2fs can't do extending DIO writes either, as f2fs
> > appears to have no mechanism for guaranteeing that leftover allocated
> > blocks past EOF will get truncated.  (f2fs does have an orphan list, but
> > it's only used for deleting inodes, not truncating them.)
> > 
> > This patch doesn't attempt to remove the F2FS_GET_BLOCK_{DIO,PRE_DIO}
> > cases in f2fs_map_blocks(); that can be cleaned up later.
> > 
> > Fixes: bfad7c2d4033 ("f2fs: introduce a new direct_IO write path")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > ---
> 
> Any opinion on this patch?  This really needs to be fixed one way or another.
> Probably before the conversion to iomap, as this fix will need to be backported.
> 
> - Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-30 22:12     ` Jaegeuk Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-07-30 22:12 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-fsdevel, stable, linux-f2fs-devel

On 07/30, Eric Biggers wrote:
> On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > From: Eric Biggers <ebiggers@google.com>
> > 
> > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > they require preallocating blocks, but f2fs doesn't support unwritten
> > blocks and therefore has to preallocate the blocks as regular blocks.
> > f2fs has no way to reliably roll back such preallocations, so as a

Hmm, I'm still wondering why this becomes a problem. And, do we really need
to roll back the preallocated blocks?

> > result, f2fs will leak uninitialized blocks to users if a DIO write
> > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > write that will fail due to misalignment, e.g.:

If there's any error, truncating blocks having NEW_ADDR could address this?

> > 
> > 	rm -f file
> > 	truncate -s 1000000 file
> > 	dd if=/dev/zero bs=999999 oflag=direct conv=notrunc of=file
> > 	od -tx1 file  # shows uninitialized disk blocks
> > 
> > Until a proper design for non-overwrite DIO writes on f2fs can be
> > designed and implemented, remove support for them and make them fall
> > back to buffered I/O.  This is what other filesystems that don't support
> > unwritten blocks, e.g. ext2, also do, at least for non-extending DIO
> > writes.  However, f2fs can't do extending DIO writes either, as f2fs
> > appears to have no mechanism for guaranteeing that leftover allocated
> > blocks past EOF will get truncated.  (f2fs does have an orphan list, but
> > it's only used for deleting inodes, not truncating them.)
> > 
> > This patch doesn't attempt to remove the F2FS_GET_BLOCK_{DIO,PRE_DIO}
> > cases in f2fs_map_blocks(); that can be cleaned up later.
> > 
> > Fixes: bfad7c2d4033 ("f2fs: introduce a new direct_IO write path")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > ---
> 
> Any opinion on this patch?  This really needs to be fixed one way or another.
> Probably before the conversion to iomap, as this fix will need to be backported.
> 
> - Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-30 22:12     ` [f2fs-dev] " Jaegeuk Kim
@ 2021-07-30 22:19       ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-30 22:19 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-f2fs-devel, Chao Yu, linux-fsdevel, stable

On Fri, Jul 30, 2021 at 03:12:15PM -0700, Jaegeuk Kim wrote:
> On 07/30, Eric Biggers wrote:
> > On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > > From: Eric Biggers <ebiggers@google.com>
> > > 
> > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > f2fs has no way to reliably roll back such preallocations, so as a
> 
> Hmm, I'm still wondering why this becomes a problem. And, do we really need
> to roll back the preallocated blocks?
> 
> > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > > write that will fail due to misalignment, e.g.:
> 
> If there's any error, truncating blocks having NEW_ADDR could address this?
> 

My understanding is that the "NEW_ADDR" block address in f2fs means that space
was reserved for the block, but not allocated in any particular place yet.
Buffered writes reserve blocks in this way, but DIO writes cannot because DIO by
definition has to directly write to a specific on-disk location.  Therefore DIO
writes require that the blocks be preallocated for real.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-30 22:19       ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-30 22:19 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-fsdevel, stable, linux-f2fs-devel

On Fri, Jul 30, 2021 at 03:12:15PM -0700, Jaegeuk Kim wrote:
> On 07/30, Eric Biggers wrote:
> > On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > > From: Eric Biggers <ebiggers@google.com>
> > > 
> > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > f2fs has no way to reliably roll back such preallocations, so as a
> 
> Hmm, I'm still wondering why this becomes a problem. And, do we really need
> to roll back the preallocated blocks?
> 
> > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > > write that will fail due to misalignment, e.g.:
> 
> If there's any error, truncating blocks having NEW_ADDR could address this?
> 

My understanding is that the "NEW_ADDR" block address in f2fs means that space
was reserved for the block, but not allocated in any particular place yet.
Buffered writes reserve blocks in this way, but DIO writes cannot because DIO by
definition has to directly write to a specific on-disk location.  Therefore DIO
writes require that the blocks be preallocated for real.

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-30 22:19       ` [f2fs-dev] " Eric Biggers
@ 2021-07-31  1:05         ` Jaegeuk Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-07-31  1:05 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-f2fs-devel, Chao Yu, linux-fsdevel, stable

On 07/30, Eric Biggers wrote:
> On Fri, Jul 30, 2021 at 03:12:15PM -0700, Jaegeuk Kim wrote:
> > On 07/30, Eric Biggers wrote:
> > > On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > > > From: Eric Biggers <ebiggers@google.com>
> > > > 
> > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > f2fs has no way to reliably roll back such preallocations, so as a
> > 
> > Hmm, I'm still wondering why this becomes a problem. And, do we really need
> > to roll back the preallocated blocks?
> > 
> > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > > > write that will fail due to misalignment, e.g.:
> > 
> > If there's any error, truncating blocks having NEW_ADDR could address this?
> > 
> 
> My understanding is that the "NEW_ADDR" block address in f2fs means that space
> was reserved for the block, but not allocated in any particular place yet.
> Buffered writes reserve blocks in this way, but DIO writes cannot because DIO by
> definition has to directly write to a specific on-disk location.  Therefore DIO
> writes require that the blocks be preallocated for real.

Sorry, checking back the DIO flow, we do allocate real block addresses if DIO
has holes.

f2fs_preallocate_blocks
 -> f2fs_map_blocks(F2FS_GET_BLOCK_PRE_DIO)
  -> __allocate_data_block()
   -> f2fs_allocate_data_block() gets a free LBA

Then, back to your concern, do we need to truncate blocks beyond i_size, if we
meet any failure?

> 
> - Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-31  1:05         ` Jaegeuk Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-07-31  1:05 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-fsdevel, stable, linux-f2fs-devel

On 07/30, Eric Biggers wrote:
> On Fri, Jul 30, 2021 at 03:12:15PM -0700, Jaegeuk Kim wrote:
> > On 07/30, Eric Biggers wrote:
> > > On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > > > From: Eric Biggers <ebiggers@google.com>
> > > > 
> > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > f2fs has no way to reliably roll back such preallocations, so as a
> > 
> > Hmm, I'm still wondering why this becomes a problem. And, do we really need
> > to roll back the preallocated blocks?
> > 
> > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > > > write that will fail due to misalignment, e.g.:
> > 
> > If there's any error, truncating blocks having NEW_ADDR could address this?
> > 
> 
> My understanding is that the "NEW_ADDR" block address in f2fs means that space
> was reserved for the block, but not allocated in any particular place yet.
> Buffered writes reserve blocks in this way, but DIO writes cannot because DIO by
> definition has to directly write to a specific on-disk location.  Therefore DIO
> writes require that the blocks be preallocated for real.

Sorry, checking back the DIO flow, we do allocate real block addresses if DIO
has holes.

f2fs_preallocate_blocks
 -> f2fs_map_blocks(F2FS_GET_BLOCK_PRE_DIO)
  -> __allocate_data_block()
   -> f2fs_allocate_data_block() gets a free LBA

Then, back to your concern, do we need to truncate blocks beyond i_size, if we
meet any failure?

> 
> - Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-31  1:05         ` [f2fs-dev] " Jaegeuk Kim
@ 2021-07-31  1:18           ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-31  1:18 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-f2fs-devel, Chao Yu, linux-fsdevel, stable

On Fri, Jul 30, 2021 at 06:05:59PM -0700, Jaegeuk Kim wrote:
> On 07/30, Eric Biggers wrote:
> > On Fri, Jul 30, 2021 at 03:12:15PM -0700, Jaegeuk Kim wrote:
> > > On 07/30, Eric Biggers wrote:
> > > > On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > > > > From: Eric Biggers <ebiggers@google.com>
> > > > > 
> > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > 
> > > Hmm, I'm still wondering why this becomes a problem. And, do we really need
> > > to roll back the preallocated blocks?
> > > 
> > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > > > > write that will fail due to misalignment, e.g.:
> > > 
> > > If there's any error, truncating blocks having NEW_ADDR could address this?
> > > 
> > 
> > My understanding is that the "NEW_ADDR" block address in f2fs means that space
> > was reserved for the block, but not allocated in any particular place yet.
> > Buffered writes reserve blocks in this way, but DIO writes cannot because DIO by
> > definition has to directly write to a specific on-disk location.  Therefore DIO
> > writes require that the blocks be preallocated for real.
> 
> Sorry, checking back the DIO flow, we do allocate real block addresses if DIO
> has holes.
> 
> f2fs_preallocate_blocks
>  -> f2fs_map_blocks(F2FS_GET_BLOCK_PRE_DIO)
>   -> __allocate_data_block()
>    -> f2fs_allocate_data_block() gets a free LBA
> 
> Then, back to your concern, do we need to truncate blocks beyond i_size, if we
> meet any failure?

That isn't enough because an allocating write is not necessarily an extending
write; it may be filling holes.  Also to be power-fail safe, the preallocations
must not be committed to disk at all until the write has completed (maybe that's
already the case in f2fs, but it's not clear to me).

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-31  1:18           ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-07-31  1:18 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-fsdevel, stable, linux-f2fs-devel

On Fri, Jul 30, 2021 at 06:05:59PM -0700, Jaegeuk Kim wrote:
> On 07/30, Eric Biggers wrote:
> > On Fri, Jul 30, 2021 at 03:12:15PM -0700, Jaegeuk Kim wrote:
> > > On 07/30, Eric Biggers wrote:
> > > > On Tue, Jul 27, 2021 at 06:51:54PM -0700, Eric Biggers wrote:
> > > > > From: Eric Biggers <ebiggers@google.com>
> > > > > 
> > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > 
> > > Hmm, I'm still wondering why this becomes a problem. And, do we really need
> > > to roll back the preallocated blocks?
> > > 
> > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > doesn't fully complete.  This can be easily reproduced by issuing a DIO
> > > > > write that will fail due to misalignment, e.g.:
> > > 
> > > If there's any error, truncating blocks having NEW_ADDR could address this?
> > > 
> > 
> > My understanding is that the "NEW_ADDR" block address in f2fs means that space
> > was reserved for the block, but not allocated in any particular place yet.
> > Buffered writes reserve blocks in this way, but DIO writes cannot because DIO by
> > definition has to directly write to a specific on-disk location.  Therefore DIO
> > writes require that the blocks be preallocated for real.
> 
> Sorry, checking back the DIO flow, we do allocate real block addresses if DIO
> has holes.
> 
> f2fs_preallocate_blocks
>  -> f2fs_map_blocks(F2FS_GET_BLOCK_PRE_DIO)
>   -> __allocate_data_block()
>    -> f2fs_allocate_data_block() gets a free LBA
> 
> Then, back to your concern, do we need to truncate blocks beyond i_size, if we
> meet any failure?

That isn't enough because an allocating write is not necessarily an extending
write; it may be filling holes.  Also to be power-fail safe, the preallocations
must not be committed to disk at all until the write has completed (maybe that's
already the case in f2fs, but it's not clear to me).

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-30 19:17   ` [f2fs-dev] " Eric Biggers
@ 2021-07-31  2:46     ` Theodore Ts'o
  -1 siblings, 0 replies; 42+ messages in thread
From: Theodore Ts'o @ 2021-07-31  2:46 UTC (permalink / raw)
  To: Eric Biggers
  Cc: linux-f2fs-devel, Jaegeuk Kim, Chao Yu, linux-fsdevel, stable

On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > they require preallocating blocks, but f2fs doesn't support unwritten
> > blocks and therefore has to preallocate the blocks as regular blocks.
> > f2fs has no way to reliably roll back such preallocations, so as a
> > result, f2fs will leak uninitialized blocks to users if a DIO write
> > doesn't fully complete.

There's another way of solving this problem which doesn't require
supporting unwritten blocks.  What a file system *could* do is to
allocate the blocks, but *not* update the on-disk data structures ---
so the allocation happens in memory only, so you know that the
physical blocks won't get used for another files, and then issue the
data block writes.  On the block I/O completion, trigger a workqueue
function which updates the on-disk metadata to assign physical blocks
to the inode.

That way if you crash before the data I/O has a chance to complete,
the on-disk logical block -> physical block map hasn't been updated
yet, and so you don't need to worry about leaking uninitialized blocks.

Cheers,

					- Ted

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-07-31  2:46     ` Theodore Ts'o
  0 siblings, 0 replies; 42+ messages in thread
From: Theodore Ts'o @ 2021-07-31  2:46 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-fsdevel, Jaegeuk Kim, stable, linux-f2fs-devel

On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > they require preallocating blocks, but f2fs doesn't support unwritten
> > blocks and therefore has to preallocate the blocks as regular blocks.
> > f2fs has no way to reliably roll back such preallocations, so as a
> > result, f2fs will leak uninitialized blocks to users if a DIO write
> > doesn't fully complete.

There's another way of solving this problem which doesn't require
supporting unwritten blocks.  What a file system *could* do is to
allocate the blocks, but *not* update the on-disk data structures ---
so the allocation happens in memory only, so you know that the
physical blocks won't get used for another files, and then issue the
data block writes.  On the block I/O completion, trigger a workqueue
function which updates the on-disk metadata to assign physical blocks
to the inode.

That way if you crash before the data I/O has a chance to complete,
the on-disk logical block -> physical block map hasn't been updated
yet, and so you don't need to worry about leaking uninitialized blocks.

Cheers,

					- Ted


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-07-31  2:46     ` [f2fs-dev] " Theodore Ts'o
@ 2021-08-02  4:39       ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-02  4:39 UTC (permalink / raw)
  To: Jaegeuk Kim, Chao Yu
  Cc: linux-f2fs-devel, Theodore Ts'o, linux-fsdevel, stable

On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > f2fs has no way to reliably roll back such preallocations, so as a
> > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > doesn't fully complete.
> 
> There's another way of solving this problem which doesn't require
> supporting unwritten blocks.  What a file system *could* do is to
> allocate the blocks, but *not* update the on-disk data structures ---
> so the allocation happens in memory only, so you know that the
> physical blocks won't get used for another files, and then issue the
> data block writes.  On the block I/O completion, trigger a workqueue
> function which updates the on-disk metadata to assign physical blocks
> to the inode.
> 
> That way if you crash before the data I/O has a chance to complete,
> the on-disk logical block -> physical block map hasn't been updated
> yet, and so you don't need to worry about leaking uninitialized blocks.
> 
> Cheers,
> 
> 					- Ted

Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-02  4:39       ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-02  4:39 UTC (permalink / raw)
  To: Jaegeuk Kim, Chao Yu
  Cc: linux-fsdevel, Theodore Ts'o, stable, linux-f2fs-devel

On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > f2fs has no way to reliably roll back such preallocations, so as a
> > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > doesn't fully complete.
> 
> There's another way of solving this problem which doesn't require
> supporting unwritten blocks.  What a file system *could* do is to
> allocate the blocks, but *not* update the on-disk data structures ---
> so the allocation happens in memory only, so you know that the
> physical blocks won't get used for another files, and then issue the
> data block writes.  On the block I/O completion, trigger a workqueue
> function which updates the on-disk metadata to assign physical blocks
> to the inode.
> 
> That way if you crash before the data I/O has a chance to complete,
> the on-disk logical block -> physical block map hasn't been updated
> yet, and so you don't need to worry about leaking uninitialized blocks.
> 
> Cheers,
> 
> 					- Ted

Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-02  4:39       ` [f2fs-dev] " Eric Biggers
@ 2021-08-02  9:00         ` Chao Yu
  -1 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-02  9:00 UTC (permalink / raw)
  To: Eric Biggers, Jaegeuk Kim, Theodore Ts'o
  Cc: linux-f2fs-devel, linux-fsdevel, stable

On 2021/8/2 12:39, Eric Biggers wrote:
> On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
>> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
>>>> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
>>>> they require preallocating blocks, but f2fs doesn't support unwritten
>>>> blocks and therefore has to preallocate the blocks as regular blocks.
>>>> f2fs has no way to reliably roll back such preallocations, so as a
>>>> result, f2fs will leak uninitialized blocks to users if a DIO write
>>>> doesn't fully complete.
>>
>> There's another way of solving this problem which doesn't require
>> supporting unwritten blocks.  What a file system *could* do is to
>> allocate the blocks, but *not* update the on-disk data structures ---
>> so the allocation happens in memory only, so you know that the
>> physical blocks won't get used for another files, and then issue the
>> data block writes.  On the block I/O completion, trigger a workqueue
>> function which updates the on-disk metadata to assign physical blocks
>> to the inode.
>>
>> That way if you crash before the data I/O has a chance to complete,
>> the on-disk logical block -> physical block map hasn't been updated
>> yet, and so you don't need to worry about leaking uninitialized blocks.

Thanks for your suggestion, I think it makes sense.

>>
>> Cheers,
>>
>> 					- Ted
> 
> Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?

Firstly, let's notice that below metadata will be touched during DIO
preallocation flow:
- log header
- sit bitmap/count
- free seg/sec bitmap/count
- dirty seg/sec bitmap/count

And there is one case we need to concern about is: checkpoint() can be
triggered randomly in between dio_preallocate() and dio_end_io(), we should
not persist any DIO preallocation related metadata during checkpoint(),
otherwise, sudden power-cut after the checkpoint will corrupt filesytem.

So it needs to well separate two kinds of metadata update:
a) belong to dio preallocation
b) the left one

After that, it will simply checkpoint() flow to just flush metadata b), for
other flow, like GC, data/node allocation, it needs to query/update metadata
after we combine metadata a) and b).

In addition, there is an existing in-memory log header framework in f2fs,
based on this fwk, it's very easy for us to add a new in-memory log header
for DIO preallocation.

So it seems feasible for me until now...

Jaegeuk, any other concerns about the implementation details?

Thanks,

> 
> - Eric
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-02  9:00         ` Chao Yu
  0 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-02  9:00 UTC (permalink / raw)
  To: Eric Biggers, Jaegeuk Kim, Theodore Ts'o
  Cc: linux-fsdevel, stable, linux-f2fs-devel

On 2021/8/2 12:39, Eric Biggers wrote:
> On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
>> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
>>>> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
>>>> they require preallocating blocks, but f2fs doesn't support unwritten
>>>> blocks and therefore has to preallocate the blocks as regular blocks.
>>>> f2fs has no way to reliably roll back such preallocations, so as a
>>>> result, f2fs will leak uninitialized blocks to users if a DIO write
>>>> doesn't fully complete.
>>
>> There's another way of solving this problem which doesn't require
>> supporting unwritten blocks.  What a file system *could* do is to
>> allocate the blocks, but *not* update the on-disk data structures ---
>> so the allocation happens in memory only, so you know that the
>> physical blocks won't get used for another files, and then issue the
>> data block writes.  On the block I/O completion, trigger a workqueue
>> function which updates the on-disk metadata to assign physical blocks
>> to the inode.
>>
>> That way if you crash before the data I/O has a chance to complete,
>> the on-disk logical block -> physical block map hasn't been updated
>> yet, and so you don't need to worry about leaking uninitialized blocks.

Thanks for your suggestion, I think it makes sense.

>>
>> Cheers,
>>
>> 					- Ted
> 
> Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?

Firstly, let's notice that below metadata will be touched during DIO
preallocation flow:
- log header
- sit bitmap/count
- free seg/sec bitmap/count
- dirty seg/sec bitmap/count

And there is one case we need to concern about is: checkpoint() can be
triggered randomly in between dio_preallocate() and dio_end_io(), we should
not persist any DIO preallocation related metadata during checkpoint(),
otherwise, sudden power-cut after the checkpoint will corrupt filesytem.

So it needs to well separate two kinds of metadata update:
a) belong to dio preallocation
b) the left one

After that, it will simply checkpoint() flow to just flush metadata b), for
other flow, like GC, data/node allocation, it needs to query/update metadata
after we combine metadata a) and b).

In addition, there is an existing in-memory log header framework in f2fs,
based on this fwk, it's very easy for us to add a new in-memory log header
for DIO preallocation.

So it seems feasible for me until now...

Jaegeuk, any other concerns about the implementation details?

Thanks,

> 
> - Eric
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-02  9:00         ` [f2fs-dev] " Chao Yu
@ 2021-08-02 18:23           ` Jaegeuk Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-02 18:23 UTC (permalink / raw)
  To: Chao Yu
  Cc: Eric Biggers, Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On 08/02, Chao Yu wrote:
> On 2021/8/2 12:39, Eric Biggers wrote:
> > On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> > > On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > doesn't fully complete.
> > > 
> > > There's another way of solving this problem which doesn't require
> > > supporting unwritten blocks.  What a file system *could* do is to
> > > allocate the blocks, but *not* update the on-disk data structures ---
> > > so the allocation happens in memory only, so you know that the
> > > physical blocks won't get used for another files, and then issue the
> > > data block writes.  On the block I/O completion, trigger a workqueue
> > > function which updates the on-disk metadata to assign physical blocks
> > > to the inode.
> > > 
> > > That way if you crash before the data I/O has a chance to complete,
> > > the on-disk logical block -> physical block map hasn't been updated
> > > yet, and so you don't need to worry about leaking uninitialized blocks.
> 
> Thanks for your suggestion, I think it makes sense.
> 
> > > 
> > > Cheers,
> > > 
> > > 					- Ted
> > 
> > Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
> 
> Firstly, let's notice that below metadata will be touched during DIO
> preallocation flow:
> - log header
> - sit bitmap/count
> - free seg/sec bitmap/count
> - dirty seg/sec bitmap/count
> 
> And there is one case we need to concern about is: checkpoint() can be
> triggered randomly in between dio_preallocate() and dio_end_io(), we should
> not persist any DIO preallocation related metadata during checkpoint(),
> otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
> 
> So it needs to well separate two kinds of metadata update:
> a) belong to dio preallocation
> b) the left one
> 
> After that, it will simply checkpoint() flow to just flush metadata b), for
> other flow, like GC, data/node allocation, it needs to query/update metadata
> after we combine metadata a) and b).
> 
> In addition, there is an existing in-memory log header framework in f2fs,
> based on this fwk, it's very easy for us to add a new in-memory log header
> for DIO preallocation.
> 
> So it seems feasible for me until now...
> 
> Jaegeuk, any other concerns about the implementation details?

Hmm, I'm still trying to deal with this as a corner case where the writes
haven't completed due to an error. How about keeping the preallocated block
offsets and releasing them if we get an error? Do we need to handle EIO right?

> 
> Thanks,
> 
> > 
> > - Eric
> > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-02 18:23           ` Jaegeuk Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-02 18:23 UTC (permalink / raw)
  To: Chao Yu
  Cc: Eric Biggers, linux-fsdevel, Theodore Ts'o, stable, linux-f2fs-devel

On 08/02, Chao Yu wrote:
> On 2021/8/2 12:39, Eric Biggers wrote:
> > On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> > > On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > doesn't fully complete.
> > > 
> > > There's another way of solving this problem which doesn't require
> > > supporting unwritten blocks.  What a file system *could* do is to
> > > allocate the blocks, but *not* update the on-disk data structures ---
> > > so the allocation happens in memory only, so you know that the
> > > physical blocks won't get used for another files, and then issue the
> > > data block writes.  On the block I/O completion, trigger a workqueue
> > > function which updates the on-disk metadata to assign physical blocks
> > > to the inode.
> > > 
> > > That way if you crash before the data I/O has a chance to complete,
> > > the on-disk logical block -> physical block map hasn't been updated
> > > yet, and so you don't need to worry about leaking uninitialized blocks.
> 
> Thanks for your suggestion, I think it makes sense.
> 
> > > 
> > > Cheers,
> > > 
> > > 					- Ted
> > 
> > Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
> 
> Firstly, let's notice that below metadata will be touched during DIO
> preallocation flow:
> - log header
> - sit bitmap/count
> - free seg/sec bitmap/count
> - dirty seg/sec bitmap/count
> 
> And there is one case we need to concern about is: checkpoint() can be
> triggered randomly in between dio_preallocate() and dio_end_io(), we should
> not persist any DIO preallocation related metadata during checkpoint(),
> otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
> 
> So it needs to well separate two kinds of metadata update:
> a) belong to dio preallocation
> b) the left one
> 
> After that, it will simply checkpoint() flow to just flush metadata b), for
> other flow, like GC, data/node allocation, it needs to query/update metadata
> after we combine metadata a) and b).
> 
> In addition, there is an existing in-memory log header framework in f2fs,
> based on this fwk, it's very easy for us to add a new in-memory log header
> for DIO preallocation.
> 
> So it seems feasible for me until now...
> 
> Jaegeuk, any other concerns about the implementation details?

Hmm, I'm still trying to deal with this as a corner case where the writes
haven't completed due to an error. How about keeping the preallocated block
offsets and releasing them if we get an error? Do we need to handle EIO right?

> 
> Thanks,
> 
> > 
> > - Eric
> > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-02 18:23           ` [f2fs-dev] " Jaegeuk Kim
@ 2021-08-03  1:19             ` Chao Yu
  -1 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-03  1:19 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Eric Biggers, Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On 2021/8/3 2:23, Jaegeuk Kim wrote:
> On 08/02, Chao Yu wrote:
>> On 2021/8/2 12:39, Eric Biggers wrote:
>>> On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
>>>> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
>>>>>> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
>>>>>> they require preallocating blocks, but f2fs doesn't support unwritten
>>>>>> blocks and therefore has to preallocate the blocks as regular blocks.
>>>>>> f2fs has no way to reliably roll back such preallocations, so as a
>>>>>> result, f2fs will leak uninitialized blocks to users if a DIO write
>>>>>> doesn't fully complete.
>>>>
>>>> There's another way of solving this problem which doesn't require
>>>> supporting unwritten blocks.  What a file system *could* do is to
>>>> allocate the blocks, but *not* update the on-disk data structures ---
>>>> so the allocation happens in memory only, so you know that the
>>>> physical blocks won't get used for another files, and then issue the
>>>> data block writes.  On the block I/O completion, trigger a workqueue
>>>> function which updates the on-disk metadata to assign physical blocks
>>>> to the inode.
>>>>
>>>> That way if you crash before the data I/O has a chance to complete,
>>>> the on-disk logical block -> physical block map hasn't been updated
>>>> yet, and so you don't need to worry about leaking uninitialized blocks.
>>
>> Thanks for your suggestion, I think it makes sense.
>>
>>>>
>>>> Cheers,
>>>>
>>>> 					- Ted
>>>
>>> Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
>>
>> Firstly, let's notice that below metadata will be touched during DIO
>> preallocation flow:
>> - log header
>> - sit bitmap/count
>> - free seg/sec bitmap/count
>> - dirty seg/sec bitmap/count
>>
>> And there is one case we need to concern about is: checkpoint() can be
>> triggered randomly in between dio_preallocate() and dio_end_io(), we should
>> not persist any DIO preallocation related metadata during checkpoint(),
>> otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
>>
>> So it needs to well separate two kinds of metadata update:
>> a) belong to dio preallocation
>> b) the left one
>>
>> After that, it will simply checkpoint() flow to just flush metadata b), for
>> other flow, like GC, data/node allocation, it needs to query/update metadata
>> after we combine metadata a) and b).
>>
>> In addition, there is an existing in-memory log header framework in f2fs,
>> based on this fwk, it's very easy for us to add a new in-memory log header
>> for DIO preallocation.
>>
>> So it seems feasible for me until now...
>>
>> Jaegeuk, any other concerns about the implementation details?
> 
> Hmm, I'm still trying to deal with this as a corner case where the writes
> haven't completed due to an error. How about keeping the preallocated block
> offsets and releasing them if we get an error? Do we need to handle EIO right?

What about the case that CP + SPO following DIO preallocation? User will
encounter uninitialized block after recovery.

Thanks,

> 
>>
>> Thanks,
>>
>>>
>>> - Eric
>>>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-03  1:19             ` Chao Yu
  0 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-03  1:19 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Eric Biggers, linux-fsdevel, Theodore Ts'o, stable, linux-f2fs-devel

On 2021/8/3 2:23, Jaegeuk Kim wrote:
> On 08/02, Chao Yu wrote:
>> On 2021/8/2 12:39, Eric Biggers wrote:
>>> On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
>>>> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
>>>>>> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
>>>>>> they require preallocating blocks, but f2fs doesn't support unwritten
>>>>>> blocks and therefore has to preallocate the blocks as regular blocks.
>>>>>> f2fs has no way to reliably roll back such preallocations, so as a
>>>>>> result, f2fs will leak uninitialized blocks to users if a DIO write
>>>>>> doesn't fully complete.
>>>>
>>>> There's another way of solving this problem which doesn't require
>>>> supporting unwritten blocks.  What a file system *could* do is to
>>>> allocate the blocks, but *not* update the on-disk data structures ---
>>>> so the allocation happens in memory only, so you know that the
>>>> physical blocks won't get used for another files, and then issue the
>>>> data block writes.  On the block I/O completion, trigger a workqueue
>>>> function which updates the on-disk metadata to assign physical blocks
>>>> to the inode.
>>>>
>>>> That way if you crash before the data I/O has a chance to complete,
>>>> the on-disk logical block -> physical block map hasn't been updated
>>>> yet, and so you don't need to worry about leaking uninitialized blocks.
>>
>> Thanks for your suggestion, I think it makes sense.
>>
>>>>
>>>> Cheers,
>>>>
>>>> 					- Ted
>>>
>>> Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
>>
>> Firstly, let's notice that below metadata will be touched during DIO
>> preallocation flow:
>> - log header
>> - sit bitmap/count
>> - free seg/sec bitmap/count
>> - dirty seg/sec bitmap/count
>>
>> And there is one case we need to concern about is: checkpoint() can be
>> triggered randomly in between dio_preallocate() and dio_end_io(), we should
>> not persist any DIO preallocation related metadata during checkpoint(),
>> otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
>>
>> So it needs to well separate two kinds of metadata update:
>> a) belong to dio preallocation
>> b) the left one
>>
>> After that, it will simply checkpoint() flow to just flush metadata b), for
>> other flow, like GC, data/node allocation, it needs to query/update metadata
>> after we combine metadata a) and b).
>>
>> In addition, there is an existing in-memory log header framework in f2fs,
>> based on this fwk, it's very easy for us to add a new in-memory log header
>> for DIO preallocation.
>>
>> So it seems feasible for me until now...
>>
>> Jaegeuk, any other concerns about the implementation details?
> 
> Hmm, I'm still trying to deal with this as a corner case where the writes
> haven't completed due to an error. How about keeping the preallocated block
> offsets and releasing them if we get an error? Do we need to handle EIO right?

What about the case that CP + SPO following DIO preallocation? User will
encounter uninitialized block after recovery.

Thanks,

> 
>>
>> Thanks,
>>
>>>
>>> - Eric
>>>


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-03  1:19             ` [f2fs-dev] " Chao Yu
@ 2021-08-03  1:34               ` Jaegeuk Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-03  1:34 UTC (permalink / raw)
  To: Chao Yu
  Cc: Eric Biggers, Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On 08/03, Chao Yu wrote:
> On 2021/8/3 2:23, Jaegeuk Kim wrote:
> > On 08/02, Chao Yu wrote:
> > > On 2021/8/2 12:39, Eric Biggers wrote:
> > > > On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> > > > > On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > > > doesn't fully complete.
> > > > > 
> > > > > There's another way of solving this problem which doesn't require
> > > > > supporting unwritten blocks.  What a file system *could* do is to
> > > > > allocate the blocks, but *not* update the on-disk data structures ---
> > > > > so the allocation happens in memory only, so you know that the
> > > > > physical blocks won't get used for another files, and then issue the
> > > > > data block writes.  On the block I/O completion, trigger a workqueue
> > > > > function which updates the on-disk metadata to assign physical blocks
> > > > > to the inode.
> > > > > 
> > > > > That way if you crash before the data I/O has a chance to complete,
> > > > > the on-disk logical block -> physical block map hasn't been updated
> > > > > yet, and so you don't need to worry about leaking uninitialized blocks.
> > > 
> > > Thanks for your suggestion, I think it makes sense.
> > > 
> > > > > 
> > > > > Cheers,
> > > > > 
> > > > > 					- Ted
> > > > 
> > > > Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
> > > 
> > > Firstly, let's notice that below metadata will be touched during DIO
> > > preallocation flow:
> > > - log header
> > > - sit bitmap/count
> > > - free seg/sec bitmap/count
> > > - dirty seg/sec bitmap/count
> > > 
> > > And there is one case we need to concern about is: checkpoint() can be
> > > triggered randomly in between dio_preallocate() and dio_end_io(), we should
> > > not persist any DIO preallocation related metadata during checkpoint(),
> > > otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
> > > 
> > > So it needs to well separate two kinds of metadata update:
> > > a) belong to dio preallocation
> > > b) the left one
> > > 
> > > After that, it will simply checkpoint() flow to just flush metadata b), for
> > > other flow, like GC, data/node allocation, it needs to query/update metadata
> > > after we combine metadata a) and b).
> > > 
> > > In addition, there is an existing in-memory log header framework in f2fs,
> > > based on this fwk, it's very easy for us to add a new in-memory log header
> > > for DIO preallocation.
> > > 
> > > So it seems feasible for me until now...
> > > 
> > > Jaegeuk, any other concerns about the implementation details?
> > 
> > Hmm, I'm still trying to deal with this as a corner case where the writes
> > haven't completed due to an error. How about keeping the preallocated block
> > offsets and releasing them if we get an error? Do we need to handle EIO right?
> 
> What about the case that CP + SPO following DIO preallocation? User will
> encounter uninitialized block after recovery.

I think buffered writes as a workaround can expose the last unwritten block as
well, if SPO happens right after block allocation. We may need to compromise
at certain level?

> 
> Thanks,
> 
> > 
> > > 
> > > Thanks,
> > > 
> > > > 
> > > > - Eric
> > > > 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-03  1:34               ` Jaegeuk Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-03  1:34 UTC (permalink / raw)
  To: Chao Yu
  Cc: Eric Biggers, linux-fsdevel, Theodore Ts'o, stable, linux-f2fs-devel

On 08/03, Chao Yu wrote:
> On 2021/8/3 2:23, Jaegeuk Kim wrote:
> > On 08/02, Chao Yu wrote:
> > > On 2021/8/2 12:39, Eric Biggers wrote:
> > > > On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> > > > > On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > > > doesn't fully complete.
> > > > > 
> > > > > There's another way of solving this problem which doesn't require
> > > > > supporting unwritten blocks.  What a file system *could* do is to
> > > > > allocate the blocks, but *not* update the on-disk data structures ---
> > > > > so the allocation happens in memory only, so you know that the
> > > > > physical blocks won't get used for another files, and then issue the
> > > > > data block writes.  On the block I/O completion, trigger a workqueue
> > > > > function which updates the on-disk metadata to assign physical blocks
> > > > > to the inode.
> > > > > 
> > > > > That way if you crash before the data I/O has a chance to complete,
> > > > > the on-disk logical block -> physical block map hasn't been updated
> > > > > yet, and so you don't need to worry about leaking uninitialized blocks.
> > > 
> > > Thanks for your suggestion, I think it makes sense.
> > > 
> > > > > 
> > > > > Cheers,
> > > > > 
> > > > > 					- Ted
> > > > 
> > > > Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
> > > 
> > > Firstly, let's notice that below metadata will be touched during DIO
> > > preallocation flow:
> > > - log header
> > > - sit bitmap/count
> > > - free seg/sec bitmap/count
> > > - dirty seg/sec bitmap/count
> > > 
> > > And there is one case we need to concern about is: checkpoint() can be
> > > triggered randomly in between dio_preallocate() and dio_end_io(), we should
> > > not persist any DIO preallocation related metadata during checkpoint(),
> > > otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
> > > 
> > > So it needs to well separate two kinds of metadata update:
> > > a) belong to dio preallocation
> > > b) the left one
> > > 
> > > After that, it will simply checkpoint() flow to just flush metadata b), for
> > > other flow, like GC, data/node allocation, it needs to query/update metadata
> > > after we combine metadata a) and b).
> > > 
> > > In addition, there is an existing in-memory log header framework in f2fs,
> > > based on this fwk, it's very easy for us to add a new in-memory log header
> > > for DIO preallocation.
> > > 
> > > So it seems feasible for me until now...
> > > 
> > > Jaegeuk, any other concerns about the implementation details?
> > 
> > Hmm, I'm still trying to deal with this as a corner case where the writes
> > haven't completed due to an error. How about keeping the preallocated block
> > offsets and releasing them if we get an error? Do we need to handle EIO right?
> 
> What about the case that CP + SPO following DIO preallocation? User will
> encounter uninitialized block after recovery.

I think buffered writes as a workaround can expose the last unwritten block as
well, if SPO happens right after block allocation. We may need to compromise
at certain level?

> 
> Thanks,
> 
> > 
> > > 
> > > Thanks,
> > > 
> > > > 
> > > > - Eric
> > > > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-03  1:34               ` [f2fs-dev] " Jaegeuk Kim
@ 2021-08-17  2:03                 ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-17  2:03 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Chao Yu, Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On Mon, Aug 02, 2021 at 06:34:48PM -0700, Jaegeuk Kim wrote:
> On 08/03, Chao Yu wrote:
> > On 2021/8/3 2:23, Jaegeuk Kim wrote:
> > > On 08/02, Chao Yu wrote:
> > > > On 2021/8/2 12:39, Eric Biggers wrote:
> > > > > On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> > > > > > On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > > > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > > > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > > > > doesn't fully complete.
> > > > > > 
> > > > > > There's another way of solving this problem which doesn't require
> > > > > > supporting unwritten blocks.  What a file system *could* do is to
> > > > > > allocate the blocks, but *not* update the on-disk data structures ---
> > > > > > so the allocation happens in memory only, so you know that the
> > > > > > physical blocks won't get used for another files, and then issue the
> > > > > > data block writes.  On the block I/O completion, trigger a workqueue
> > > > > > function which updates the on-disk metadata to assign physical blocks
> > > > > > to the inode.
> > > > > > 
> > > > > > That way if you crash before the data I/O has a chance to complete,
> > > > > > the on-disk logical block -> physical block map hasn't been updated
> > > > > > yet, and so you don't need to worry about leaking uninitialized blocks.
> > > > 
> > > > Thanks for your suggestion, I think it makes sense.
> > > > 
> > > > > > 
> > > > > > Cheers,
> > > > > > 
> > > > > > 					- Ted
> > > > > 
> > > > > Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
> > > > 
> > > > Firstly, let's notice that below metadata will be touched during DIO
> > > > preallocation flow:
> > > > - log header
> > > > - sit bitmap/count
> > > > - free seg/sec bitmap/count
> > > > - dirty seg/sec bitmap/count
> > > > 
> > > > And there is one case we need to concern about is: checkpoint() can be
> > > > triggered randomly in between dio_preallocate() and dio_end_io(), we should
> > > > not persist any DIO preallocation related metadata during checkpoint(),
> > > > otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
> > > > 
> > > > So it needs to well separate two kinds of metadata update:
> > > > a) belong to dio preallocation
> > > > b) the left one
> > > > 
> > > > After that, it will simply checkpoint() flow to just flush metadata b), for
> > > > other flow, like GC, data/node allocation, it needs to query/update metadata
> > > > after we combine metadata a) and b).
> > > > 
> > > > In addition, there is an existing in-memory log header framework in f2fs,
> > > > based on this fwk, it's very easy for us to add a new in-memory log header
> > > > for DIO preallocation.
> > > > 
> > > > So it seems feasible for me until now...
> > > > 
> > > > Jaegeuk, any other concerns about the implementation details?
> > > 
> > > Hmm, I'm still trying to deal with this as a corner case where the writes
> > > haven't completed due to an error. How about keeping the preallocated block
> > > offsets and releasing them if we get an error? Do we need to handle EIO right?
> > 
> > What about the case that CP + SPO following DIO preallocation? User will
> > encounter uninitialized block after recovery.
> 
> I think buffered writes as a workaround can expose the last unwritten block as
> well, if SPO happens right after block allocation. We may need to compromise
> at certain level?
> 

Freeing preallocated blocks on error would be better than nothing, although note
that the preallocated blocks may have filled an arbitrary sequence of holes --
so simply truncating past EOF would *not* be sufficient.

But really filesystems need to be designed to never expose uninitialized data,
even if I/O errors or a sudden power failure occurs.  It is unfortunate that
f2fs apparently wasn't designed with that goal in mind.

In any case, I don't think we can proceed with any other f2fs direct I/O
improvements until this data leakage bug can be solved one way or another.  If
my patch to remove support for allocating writes isn't acceptable and the
desired solution is going to require some more invasive f2fs surgery, are you or
Chao going to work on it?  I'm not sure there's much I can do here.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-17  2:03                 ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-17  2:03 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-fsdevel, Theodore Ts'o, stable, linux-f2fs-devel

On Mon, Aug 02, 2021 at 06:34:48PM -0700, Jaegeuk Kim wrote:
> On 08/03, Chao Yu wrote:
> > On 2021/8/3 2:23, Jaegeuk Kim wrote:
> > > On 08/02, Chao Yu wrote:
> > > > On 2021/8/2 12:39, Eric Biggers wrote:
> > > > > On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
> > > > > > On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
> > > > > > > > Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
> > > > > > > > they require preallocating blocks, but f2fs doesn't support unwritten
> > > > > > > > blocks and therefore has to preallocate the blocks as regular blocks.
> > > > > > > > f2fs has no way to reliably roll back such preallocations, so as a
> > > > > > > > result, f2fs will leak uninitialized blocks to users if a DIO write
> > > > > > > > doesn't fully complete.
> > > > > > 
> > > > > > There's another way of solving this problem which doesn't require
> > > > > > supporting unwritten blocks.  What a file system *could* do is to
> > > > > > allocate the blocks, but *not* update the on-disk data structures ---
> > > > > > so the allocation happens in memory only, so you know that the
> > > > > > physical blocks won't get used for another files, and then issue the
> > > > > > data block writes.  On the block I/O completion, trigger a workqueue
> > > > > > function which updates the on-disk metadata to assign physical blocks
> > > > > > to the inode.
> > > > > > 
> > > > > > That way if you crash before the data I/O has a chance to complete,
> > > > > > the on-disk logical block -> physical block map hasn't been updated
> > > > > > yet, and so you don't need to worry about leaking uninitialized blocks.
> > > > 
> > > > Thanks for your suggestion, I think it makes sense.
> > > > 
> > > > > > 
> > > > > > Cheers,
> > > > > > 
> > > > > > 					- Ted
> > > > > 
> > > > > Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
> > > > 
> > > > Firstly, let's notice that below metadata will be touched during DIO
> > > > preallocation flow:
> > > > - log header
> > > > - sit bitmap/count
> > > > - free seg/sec bitmap/count
> > > > - dirty seg/sec bitmap/count
> > > > 
> > > > And there is one case we need to concern about is: checkpoint() can be
> > > > triggered randomly in between dio_preallocate() and dio_end_io(), we should
> > > > not persist any DIO preallocation related metadata during checkpoint(),
> > > > otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
> > > > 
> > > > So it needs to well separate two kinds of metadata update:
> > > > a) belong to dio preallocation
> > > > b) the left one
> > > > 
> > > > After that, it will simply checkpoint() flow to just flush metadata b), for
> > > > other flow, like GC, data/node allocation, it needs to query/update metadata
> > > > after we combine metadata a) and b).
> > > > 
> > > > In addition, there is an existing in-memory log header framework in f2fs,
> > > > based on this fwk, it's very easy for us to add a new in-memory log header
> > > > for DIO preallocation.
> > > > 
> > > > So it seems feasible for me until now...
> > > > 
> > > > Jaegeuk, any other concerns about the implementation details?
> > > 
> > > Hmm, I'm still trying to deal with this as a corner case where the writes
> > > haven't completed due to an error. How about keeping the preallocated block
> > > offsets and releasing them if we get an error? Do we need to handle EIO right?
> > 
> > What about the case that CP + SPO following DIO preallocation? User will
> > encounter uninitialized block after recovery.
> 
> I think buffered writes as a workaround can expose the last unwritten block as
> well, if SPO happens right after block allocation. We may need to compromise
> at certain level?
> 

Freeing preallocated blocks on error would be better than nothing, although note
that the preallocated blocks may have filled an arbitrary sequence of holes --
so simply truncating past EOF would *not* be sufficient.

But really filesystems need to be designed to never expose uninitialized data,
even if I/O errors or a sudden power failure occurs.  It is unfortunate that
f2fs apparently wasn't designed with that goal in mind.

In any case, I don't think we can proceed with any other f2fs direct I/O
improvements until this data leakage bug can be solved one way or another.  If
my patch to remove support for allocating writes isn't acceptable and the
desired solution is going to require some more invasive f2fs surgery, are you or
Chao going to work on it?  I'm not sure there's much I can do here.

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-17  2:03                 ` [f2fs-dev] " Eric Biggers
@ 2021-08-17  5:42                   ` Christoph Hellwig
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-08-17  5:42 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Jaegeuk Kim, Chao Yu, Theodore Ts'o, linux-f2fs-devel,
	linux-fsdevel, stable

On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> Freeing preallocated blocks on error would be better than nothing, although note
> that the preallocated blocks may have filled an arbitrary sequence of holes --
> so simply truncating past EOF would *not* be sufficient.
> 
> But really filesystems need to be designed to never expose uninitialized data,
> even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> f2fs apparently wasn't designed with that goal in mind.
> 
> In any case, I don't think we can proceed with any other f2fs direct I/O
> improvements until this data leakage bug can be solved one way or another.  If
> my patch to remove support for allocating writes isn't acceptable and the
> desired solution is going to require some more invasive f2fs surgery, are you or
> Chao going to work on it?  I'm not sure there's much I can do here.

Btw, this is generally a problem for buffered I/O as well, although the
window for exposing uninitialized blocks on a crash tends to be smaller.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-17  5:42                   ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-08-17  5:42 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Theodore Ts'o, stable, linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim

On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> Freeing preallocated blocks on error would be better than nothing, although note
> that the preallocated blocks may have filled an arbitrary sequence of holes --
> so simply truncating past EOF would *not* be sufficient.
> 
> But really filesystems need to be designed to never expose uninitialized data,
> even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> f2fs apparently wasn't designed with that goal in mind.
> 
> In any case, I don't think we can proceed with any other f2fs direct I/O
> improvements until this data leakage bug can be solved one way or another.  If
> my patch to remove support for allocating writes isn't acceptable and the
> desired solution is going to require some more invasive f2fs surgery, are you or
> Chao going to work on it?  I'm not sure there's much I can do here.

Btw, this is generally a problem for buffered I/O as well, although the
window for exposing uninitialized blocks on a crash tends to be smaller.


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-17  5:42                   ` [f2fs-dev] " Christoph Hellwig
@ 2021-08-17 18:57                     ` Jaegeuk Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-17 18:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Eric Biggers, Chao Yu, Theodore Ts'o, linux-f2fs-devel,
	linux-fsdevel, stable

On 08/17, Christoph Hellwig wrote:
> On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > Freeing preallocated blocks on error would be better than nothing, although note
> > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > so simply truncating past EOF would *not* be sufficient.
> > 
> > But really filesystems need to be designed to never expose uninitialized data,
> > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > f2fs apparently wasn't designed with that goal in mind.
> > 
> > In any case, I don't think we can proceed with any other f2fs direct I/O
> > improvements until this data leakage bug can be solved one way or another.  If
> > my patch to remove support for allocating writes isn't acceptable and the
> > desired solution is going to require some more invasive f2fs surgery, are you or
> > Chao going to work on it?  I'm not sure there's much I can do here.
> 
> Btw, this is generally a problem for buffered I/O as well, although the
> window for exposing uninitialized blocks on a crash tends to be smaller.

How about adding a warning message when we meet an error with preallocated
unwritten blocks? In the meantime, can we get the Eric's patches for iomap
support? I feel that we only need to modify the preallocation and error
handling parts?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-17 18:57                     ` Jaegeuk Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-17 18:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Theodore Ts'o, stable, linux-f2fs-devel, Eric Biggers, linux-fsdevel

On 08/17, Christoph Hellwig wrote:
> On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > Freeing preallocated blocks on error would be better than nothing, although note
> > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > so simply truncating past EOF would *not* be sufficient.
> > 
> > But really filesystems need to be designed to never expose uninitialized data,
> > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > f2fs apparently wasn't designed with that goal in mind.
> > 
> > In any case, I don't think we can proceed with any other f2fs direct I/O
> > improvements until this data leakage bug can be solved one way or another.  If
> > my patch to remove support for allocating writes isn't acceptable and the
> > desired solution is going to require some more invasive f2fs surgery, are you or
> > Chao going to work on it?  I'm not sure there's much I can do here.
> 
> Btw, this is generally a problem for buffered I/O as well, although the
> window for exposing uninitialized blocks on a crash tends to be smaller.

How about adding a warning message when we meet an error with preallocated
unwritten blocks? In the meantime, can we get the Eric's patches for iomap
support? I feel that we only need to modify the preallocation and error
handling parts?


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-17 18:57                     ` [f2fs-dev] " Jaegeuk Kim
@ 2021-08-17 20:27                       ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-17 20:27 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Christoph Hellwig, Chao Yu, Theodore Ts'o, linux-f2fs-devel,
	linux-fsdevel, stable

On Tue, Aug 17, 2021 at 11:57:46AM -0700, Jaegeuk Kim wrote:
> On 08/17, Christoph Hellwig wrote:
> > On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > > Freeing preallocated blocks on error would be better than nothing, although note
> > > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > > so simply truncating past EOF would *not* be sufficient.
> > > 
> > > But really filesystems need to be designed to never expose uninitialized data,
> > > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > > f2fs apparently wasn't designed with that goal in mind.
> > > 
> > > In any case, I don't think we can proceed with any other f2fs direct I/O
> > > improvements until this data leakage bug can be solved one way or another.  If
> > > my patch to remove support for allocating writes isn't acceptable and the
> > > desired solution is going to require some more invasive f2fs surgery, are you or
> > > Chao going to work on it?  I'm not sure there's much I can do here.
> > 
> > Btw, this is generally a problem for buffered I/O as well, although the
> > window for exposing uninitialized blocks on a crash tends to be smaller.
> 
> How about adding a warning message when we meet an error with preallocated
> unwritten blocks? In the meantime, can we get the Eric's patches for iomap
> support? I feel that we only need to modify the preallocation and error
> handling parts?

A warning message would do nothing to prevent uninitialized blocks from being
leaked to userspace.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-17 20:27                       ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-17 20:27 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Theodore Ts'o, stable, linux-f2fs-devel, Christoph Hellwig,
	linux-fsdevel

On Tue, Aug 17, 2021 at 11:57:46AM -0700, Jaegeuk Kim wrote:
> On 08/17, Christoph Hellwig wrote:
> > On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > > Freeing preallocated blocks on error would be better than nothing, although note
> > > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > > so simply truncating past EOF would *not* be sufficient.
> > > 
> > > But really filesystems need to be designed to never expose uninitialized data,
> > > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > > f2fs apparently wasn't designed with that goal in mind.
> > > 
> > > In any case, I don't think we can proceed with any other f2fs direct I/O
> > > improvements until this data leakage bug can be solved one way or another.  If
> > > my patch to remove support for allocating writes isn't acceptable and the
> > > desired solution is going to require some more invasive f2fs surgery, are you or
> > > Chao going to work on it?  I'm not sure there's much I can do here.
> > 
> > Btw, this is generally a problem for buffered I/O as well, although the
> > window for exposing uninitialized blocks on a crash tends to be smaller.
> 
> How about adding a warning message when we meet an error with preallocated
> unwritten blocks? In the meantime, can we get the Eric's patches for iomap
> support? I feel that we only need to modify the preallocation and error
> handling parts?

A warning message would do nothing to prevent uninitialized blocks from being
leaked to userspace.

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-17 20:27                       ` [f2fs-dev] " Eric Biggers
@ 2021-08-17 21:33                         ` Jaegeuk Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-17 21:33 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Christoph Hellwig, Chao Yu, Theodore Ts'o, linux-f2fs-devel,
	linux-fsdevel, stable

On 08/17, Eric Biggers wrote:
> On Tue, Aug 17, 2021 at 11:57:46AM -0700, Jaegeuk Kim wrote:
> > On 08/17, Christoph Hellwig wrote:
> > > On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > > > Freeing preallocated blocks on error would be better than nothing, although note
> > > > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > > > so simply truncating past EOF would *not* be sufficient.
> > > > 
> > > > But really filesystems need to be designed to never expose uninitialized data,
> > > > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > > > f2fs apparently wasn't designed with that goal in mind.
> > > > 
> > > > In any case, I don't think we can proceed with any other f2fs direct I/O
> > > > improvements until this data leakage bug can be solved one way or another.  If
> > > > my patch to remove support for allocating writes isn't acceptable and the
> > > > desired solution is going to require some more invasive f2fs surgery, are you or
> > > > Chao going to work on it?  I'm not sure there's much I can do here.
> > > 
> > > Btw, this is generally a problem for buffered I/O as well, although the
> > > window for exposing uninitialized blocks on a crash tends to be smaller.
> > 
> > How about adding a warning message when we meet an error with preallocated
> > unwritten blocks? In the meantime, can we get the Eric's patches for iomap
> > support? I feel that we only need to modify the preallocation and error
> > handling parts?
> 
> A warning message would do nothing to prevent uninitialized blocks from being
> leaked to userspace.

To give a signal that it's a known issue that we'll fix later.

> 
> - Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-17 21:33                         ` Jaegeuk Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Jaegeuk Kim @ 2021-08-17 21:33 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Theodore Ts'o, stable, linux-f2fs-devel, Christoph Hellwig,
	linux-fsdevel

On 08/17, Eric Biggers wrote:
> On Tue, Aug 17, 2021 at 11:57:46AM -0700, Jaegeuk Kim wrote:
> > On 08/17, Christoph Hellwig wrote:
> > > On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > > > Freeing preallocated blocks on error would be better than nothing, although note
> > > > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > > > so simply truncating past EOF would *not* be sufficient.
> > > > 
> > > > But really filesystems need to be designed to never expose uninitialized data,
> > > > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > > > f2fs apparently wasn't designed with that goal in mind.
> > > > 
> > > > In any case, I don't think we can proceed with any other f2fs direct I/O
> > > > improvements until this data leakage bug can be solved one way or another.  If
> > > > my patch to remove support for allocating writes isn't acceptable and the
> > > > desired solution is going to require some more invasive f2fs surgery, are you or
> > > > Chao going to work on it?  I'm not sure there's much I can do here.
> > > 
> > > Btw, this is generally a problem for buffered I/O as well, although the
> > > window for exposing uninitialized blocks on a crash tends to be smaller.
> > 
> > How about adding a warning message when we meet an error with preallocated
> > unwritten blocks? In the meantime, can we get the Eric's patches for iomap
> > support? I feel that we only need to modify the preallocation and error
> > handling parts?
> 
> A warning message would do nothing to prevent uninitialized blocks from being
> leaked to userspace.

To give a signal that it's a known issue that we'll fix later.

> 
> - Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-17 21:33                         ` [f2fs-dev] " Jaegeuk Kim
@ 2021-08-18  0:06                           ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-18  0:06 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Christoph Hellwig, Chao Yu, Theodore Ts'o, linux-f2fs-devel,
	linux-fsdevel, stable

On Tue, Aug 17, 2021 at 02:33:01PM -0700, Jaegeuk Kim wrote:
> On 08/17, Eric Biggers wrote:
> > On Tue, Aug 17, 2021 at 11:57:46AM -0700, Jaegeuk Kim wrote:
> > > On 08/17, Christoph Hellwig wrote:
> > > > On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > > > > Freeing preallocated blocks on error would be better than nothing, although note
> > > > > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > > > > so simply truncating past EOF would *not* be sufficient.
> > > > > 
> > > > > But really filesystems need to be designed to never expose uninitialized data,
> > > > > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > > > > f2fs apparently wasn't designed with that goal in mind.
> > > > > 
> > > > > In any case, I don't think we can proceed with any other f2fs direct I/O
> > > > > improvements until this data leakage bug can be solved one way or another.  If
> > > > > my patch to remove support for allocating writes isn't acceptable and the
> > > > > desired solution is going to require some more invasive f2fs surgery, are you or
> > > > > Chao going to work on it?  I'm not sure there's much I can do here.
> > > > 
> > > > Btw, this is generally a problem for buffered I/O as well, although the
> > > > window for exposing uninitialized blocks on a crash tends to be smaller.
> > > 
> > > How about adding a warning message when we meet an error with preallocated
> > > unwritten blocks? In the meantime, can we get the Eric's patches for iomap
> > > support? I feel that we only need to modify the preallocation and error
> > > handling parts?
> > 
> > A warning message would do nothing to prevent uninitialized blocks from being
> > leaked to userspace.
> 
> To give a signal that it's a known issue that we'll fix later.
> 

This bug is concerning mainly because it's a security vulnerability: anyone with
read+write access to just one file on an f2fs filesystem can effectively read
all other files on that filesystem.  A warning message wouldn't change that.

And even in the case of this bug breaking a non-malicious program, hardly anyone
reads kernel log messages anyway.  If something is broken, having a log message
that says "yeah, this is broken, sorry" isn't going to accomplish much...

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-18  0:06                           ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-18  0:06 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: Theodore Ts'o, stable, linux-f2fs-devel, Christoph Hellwig,
	linux-fsdevel

On Tue, Aug 17, 2021 at 02:33:01PM -0700, Jaegeuk Kim wrote:
> On 08/17, Eric Biggers wrote:
> > On Tue, Aug 17, 2021 at 11:57:46AM -0700, Jaegeuk Kim wrote:
> > > On 08/17, Christoph Hellwig wrote:
> > > > On Mon, Aug 16, 2021 at 07:03:21PM -0700, Eric Biggers wrote:
> > > > > Freeing preallocated blocks on error would be better than nothing, although note
> > > > > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > > > > so simply truncating past EOF would *not* be sufficient.
> > > > > 
> > > > > But really filesystems need to be designed to never expose uninitialized data,
> > > > > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > > > > f2fs apparently wasn't designed with that goal in mind.
> > > > > 
> > > > > In any case, I don't think we can proceed with any other f2fs direct I/O
> > > > > improvements until this data leakage bug can be solved one way or another.  If
> > > > > my patch to remove support for allocating writes isn't acceptable and the
> > > > > desired solution is going to require some more invasive f2fs surgery, are you or
> > > > > Chao going to work on it?  I'm not sure there's much I can do here.
> > > > 
> > > > Btw, this is generally a problem for buffered I/O as well, although the
> > > > window for exposing uninitialized blocks on a crash tends to be smaller.
> > > 
> > > How about adding a warning message when we meet an error with preallocated
> > > unwritten blocks? In the meantime, can we get the Eric's patches for iomap
> > > support? I feel that we only need to modify the preallocation and error
> > > handling parts?
> > 
> > A warning message would do nothing to prevent uninitialized blocks from being
> > leaked to userspace.
> 
> To give a signal that it's a known issue that we'll fix later.
> 

This bug is concerning mainly because it's a security vulnerability: anyone with
read+write access to just one file on an f2fs filesystem can effectively read
all other files on that filesystem.  A warning message wouldn't change that.

And even in the case of this bug breaking a non-malicious program, hardly anyone
reads kernel log messages anyway.  If something is broken, having a log message
that says "yeah, this is broken, sorry" isn't going to accomplish much...

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-17  2:03                 ` [f2fs-dev] " Eric Biggers
@ 2021-08-20  9:35                   ` Chao Yu
  -1 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-20  9:35 UTC (permalink / raw)
  To: Eric Biggers, Jaegeuk Kim
  Cc: Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On 2021/8/17 10:03, Eric Biggers wrote:
> On Mon, Aug 02, 2021 at 06:34:48PM -0700, Jaegeuk Kim wrote:
>> On 08/03, Chao Yu wrote:
>>> On 2021/8/3 2:23, Jaegeuk Kim wrote:
>>>> On 08/02, Chao Yu wrote:
>>>>> On 2021/8/2 12:39, Eric Biggers wrote:
>>>>>> On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
>>>>>>> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
>>>>>>>>> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
>>>>>>>>> they require preallocating blocks, but f2fs doesn't support unwritten
>>>>>>>>> blocks and therefore has to preallocate the blocks as regular blocks.
>>>>>>>>> f2fs has no way to reliably roll back such preallocations, so as a
>>>>>>>>> result, f2fs will leak uninitialized blocks to users if a DIO write
>>>>>>>>> doesn't fully complete.
>>>>>>>
>>>>>>> There's another way of solving this problem which doesn't require
>>>>>>> supporting unwritten blocks.  What a file system *could* do is to
>>>>>>> allocate the blocks, but *not* update the on-disk data structures ---
>>>>>>> so the allocation happens in memory only, so you know that the
>>>>>>> physical blocks won't get used for another files, and then issue the
>>>>>>> data block writes.  On the block I/O completion, trigger a workqueue
>>>>>>> function which updates the on-disk metadata to assign physical blocks
>>>>>>> to the inode.
>>>>>>>
>>>>>>> That way if you crash before the data I/O has a chance to complete,
>>>>>>> the on-disk logical block -> physical block map hasn't been updated
>>>>>>> yet, and so you don't need to worry about leaking uninitialized blocks.
>>>>>
>>>>> Thanks for your suggestion, I think it makes sense.
>>>>>
>>>>>>>
>>>>>>> Cheers,
>>>>>>>
>>>>>>> 					- Ted
>>>>>>
>>>>>> Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
>>>>>
>>>>> Firstly, let's notice that below metadata will be touched during DIO
>>>>> preallocation flow:
>>>>> - log header
>>>>> - sit bitmap/count
>>>>> - free seg/sec bitmap/count
>>>>> - dirty seg/sec bitmap/count
>>>>>
>>>>> And there is one case we need to concern about is: checkpoint() can be
>>>>> triggered randomly in between dio_preallocate() and dio_end_io(), we should
>>>>> not persist any DIO preallocation related metadata during checkpoint(),
>>>>> otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
>>>>>
>>>>> So it needs to well separate two kinds of metadata update:
>>>>> a) belong to dio preallocation
>>>>> b) the left one
>>>>>
>>>>> After that, it will simply checkpoint() flow to just flush metadata b), for
>>>>> other flow, like GC, data/node allocation, it needs to query/update metadata
>>>>> after we combine metadata a) and b).
>>>>>
>>>>> In addition, there is an existing in-memory log header framework in f2fs,
>>>>> based on this fwk, it's very easy for us to add a new in-memory log header
>>>>> for DIO preallocation.
>>>>>
>>>>> So it seems feasible for me until now...
>>>>>
>>>>> Jaegeuk, any other concerns about the implementation details?
>>>>
>>>> Hmm, I'm still trying to deal with this as a corner case where the writes
>>>> haven't completed due to an error. How about keeping the preallocated block
>>>> offsets and releasing them if we get an error? Do we need to handle EIO right?
>>>
>>> What about the case that CP + SPO following DIO preallocation? User will
>>> encounter uninitialized block after recovery.
>>
>> I think buffered writes as a workaround can expose the last unwritten block as
>> well, if SPO happens right after block allocation. We may need to compromise
>> at certain level?
>>
> 
> Freeing preallocated blocks on error would be better than nothing, although note
> that the preallocated blocks may have filled an arbitrary sequence of holes --
> so simply truncating past EOF would *not* be sufficient.
> 
> But really filesystems need to be designed to never expose uninitialized data,
> even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> f2fs apparently wasn't designed with that goal in mind.
> 
> In any case, I don't think we can proceed with any other f2fs direct I/O
> improvements until this data leakage bug can be solved one way or another.  If
> my patch to remove support for allocating writes isn't acceptable and the
> desired solution is going to require some more invasive f2fs surgery, are you or
> Chao going to work on it?  I'm not sure there's much I can do here.

I may have time to take look into the implementation as I proposed above, maybe
just enabling this in FSYNC_MODE_STRICT mode if user concerns unwritten data?
thoughts?

> 
> - Eric
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-20  9:35                   ` Chao Yu
  0 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-20  9:35 UTC (permalink / raw)
  To: Eric Biggers, Jaegeuk Kim
  Cc: linux-fsdevel, Theodore Ts'o, stable, linux-f2fs-devel

On 2021/8/17 10:03, Eric Biggers wrote:
> On Mon, Aug 02, 2021 at 06:34:48PM -0700, Jaegeuk Kim wrote:
>> On 08/03, Chao Yu wrote:
>>> On 2021/8/3 2:23, Jaegeuk Kim wrote:
>>>> On 08/02, Chao Yu wrote:
>>>>> On 2021/8/2 12:39, Eric Biggers wrote:
>>>>>> On Fri, Jul 30, 2021 at 10:46:16PM -0400, Theodore Ts'o wrote:
>>>>>>> On Fri, Jul 30, 2021 at 12:17:26PM -0700, Eric Biggers wrote:
>>>>>>>>> Currently, non-overwrite DIO writes are fundamentally unsafe on f2fs as
>>>>>>>>> they require preallocating blocks, but f2fs doesn't support unwritten
>>>>>>>>> blocks and therefore has to preallocate the blocks as regular blocks.
>>>>>>>>> f2fs has no way to reliably roll back such preallocations, so as a
>>>>>>>>> result, f2fs will leak uninitialized blocks to users if a DIO write
>>>>>>>>> doesn't fully complete.
>>>>>>>
>>>>>>> There's another way of solving this problem which doesn't require
>>>>>>> supporting unwritten blocks.  What a file system *could* do is to
>>>>>>> allocate the blocks, but *not* update the on-disk data structures ---
>>>>>>> so the allocation happens in memory only, so you know that the
>>>>>>> physical blocks won't get used for another files, and then issue the
>>>>>>> data block writes.  On the block I/O completion, trigger a workqueue
>>>>>>> function which updates the on-disk metadata to assign physical blocks
>>>>>>> to the inode.
>>>>>>>
>>>>>>> That way if you crash before the data I/O has a chance to complete,
>>>>>>> the on-disk logical block -> physical block map hasn't been updated
>>>>>>> yet, and so you don't need to worry about leaking uninitialized blocks.
>>>>>
>>>>> Thanks for your suggestion, I think it makes sense.
>>>>>
>>>>>>>
>>>>>>> Cheers,
>>>>>>>
>>>>>>> 					- Ted
>>>>>>
>>>>>> Jaegeuk and Chao, any idea how feasible it would be for f2fs to do this?
>>>>>
>>>>> Firstly, let's notice that below metadata will be touched during DIO
>>>>> preallocation flow:
>>>>> - log header
>>>>> - sit bitmap/count
>>>>> - free seg/sec bitmap/count
>>>>> - dirty seg/sec bitmap/count
>>>>>
>>>>> And there is one case we need to concern about is: checkpoint() can be
>>>>> triggered randomly in between dio_preallocate() and dio_end_io(), we should
>>>>> not persist any DIO preallocation related metadata during checkpoint(),
>>>>> otherwise, sudden power-cut after the checkpoint will corrupt filesytem.
>>>>>
>>>>> So it needs to well separate two kinds of metadata update:
>>>>> a) belong to dio preallocation
>>>>> b) the left one
>>>>>
>>>>> After that, it will simply checkpoint() flow to just flush metadata b), for
>>>>> other flow, like GC, data/node allocation, it needs to query/update metadata
>>>>> after we combine metadata a) and b).
>>>>>
>>>>> In addition, there is an existing in-memory log header framework in f2fs,
>>>>> based on this fwk, it's very easy for us to add a new in-memory log header
>>>>> for DIO preallocation.
>>>>>
>>>>> So it seems feasible for me until now...
>>>>>
>>>>> Jaegeuk, any other concerns about the implementation details?
>>>>
>>>> Hmm, I'm still trying to deal with this as a corner case where the writes
>>>> haven't completed due to an error. How about keeping the preallocated block
>>>> offsets and releasing them if we get an error? Do we need to handle EIO right?
>>>
>>> What about the case that CP + SPO following DIO preallocation? User will
>>> encounter uninitialized block after recovery.
>>
>> I think buffered writes as a workaround can expose the last unwritten block as
>> well, if SPO happens right after block allocation. We may need to compromise
>> at certain level?
>>
> 
> Freeing preallocated blocks on error would be better than nothing, although note
> that the preallocated blocks may have filled an arbitrary sequence of holes --
> so simply truncating past EOF would *not* be sufficient.
> 
> But really filesystems need to be designed to never expose uninitialized data,
> even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> f2fs apparently wasn't designed with that goal in mind.
> 
> In any case, I don't think we can proceed with any other f2fs direct I/O
> improvements until this data leakage bug can be solved one way or another.  If
> my patch to remove support for allocating writes isn't acceptable and the
> desired solution is going to require some more invasive f2fs surgery, are you or
> Chao going to work on it?  I'm not sure there's much I can do here.

I may have time to take look into the implementation as I proposed above, maybe
just enabling this in FSYNC_MODE_STRICT mode if user concerns unwritten data?
thoughts?

> 
> - Eric
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-20  9:35                   ` [f2fs-dev] " Chao Yu
@ 2021-08-20 18:11                     ` Eric Biggers
  -1 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-20 18:11 UTC (permalink / raw)
  To: Chao Yu
  Cc: Jaegeuk Kim, Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On Fri, Aug 20, 2021 at 05:35:21PM +0800, Chao Yu wrote:
> > > > > 
> > > > > Hmm, I'm still trying to deal with this as a corner case where the writes
> > > > > haven't completed due to an error. How about keeping the preallocated block
> > > > > offsets and releasing them if we get an error? Do we need to handle EIO right?
> > > > 
> > > > What about the case that CP + SPO following DIO preallocation? User will
> > > > encounter uninitialized block after recovery.
> > > 
> > > I think buffered writes as a workaround can expose the last unwritten block as
> > > well, if SPO happens right after block allocation. We may need to compromise
> > > at certain level?
> > > 
> > 
> > Freeing preallocated blocks on error would be better than nothing, although note
> > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > so simply truncating past EOF would *not* be sufficient.
> > 
> > But really filesystems need to be designed to never expose uninitialized data,
> > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > f2fs apparently wasn't designed with that goal in mind.
> > 
> > In any case, I don't think we can proceed with any other f2fs direct I/O
> > improvements until this data leakage bug can be solved one way or another.  If
> > my patch to remove support for allocating writes isn't acceptable and the
> > desired solution is going to require some more invasive f2fs surgery, are you or
> > Chao going to work on it?  I'm not sure there's much I can do here.
> 
> I may have time to take look into the implementation as I proposed above, maybe
> just enabling this in FSYNC_MODE_STRICT mode if user concerns unwritten data?
> thoughts?
> 

What does this have to do with fsync?

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-20 18:11                     ` Eric Biggers
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Biggers @ 2021-08-20 18:11 UTC (permalink / raw)
  To: Chao Yu
  Cc: linux-fsdevel, Jaegeuk Kim, Theodore Ts'o, stable, linux-f2fs-devel

On Fri, Aug 20, 2021 at 05:35:21PM +0800, Chao Yu wrote:
> > > > > 
> > > > > Hmm, I'm still trying to deal with this as a corner case where the writes
> > > > > haven't completed due to an error. How about keeping the preallocated block
> > > > > offsets and releasing them if we get an error? Do we need to handle EIO right?
> > > > 
> > > > What about the case that CP + SPO following DIO preallocation? User will
> > > > encounter uninitialized block after recovery.
> > > 
> > > I think buffered writes as a workaround can expose the last unwritten block as
> > > well, if SPO happens right after block allocation. We may need to compromise
> > > at certain level?
> > > 
> > 
> > Freeing preallocated blocks on error would be better than nothing, although note
> > that the preallocated blocks may have filled an arbitrary sequence of holes --
> > so simply truncating past EOF would *not* be sufficient.
> > 
> > But really filesystems need to be designed to never expose uninitialized data,
> > even if I/O errors or a sudden power failure occurs.  It is unfortunate that
> > f2fs apparently wasn't designed with that goal in mind.
> > 
> > In any case, I don't think we can proceed with any other f2fs direct I/O
> > improvements until this data leakage bug can be solved one way or another.  If
> > my patch to remove support for allocating writes isn't acceptable and the
> > desired solution is going to require some more invasive f2fs surgery, are you or
> > Chao going to work on it?  I'm not sure there's much I can do here.
> 
> I may have time to take look into the implementation as I proposed above, maybe
> just enabling this in FSYNC_MODE_STRICT mode if user concerns unwritten data?
> thoughts?
> 

What does this have to do with fsync?

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] f2fs: remove broken support for allocating DIO writes
  2021-08-20 18:11                     ` [f2fs-dev] " Eric Biggers
@ 2021-08-20 22:01                       ` Chao Yu
  -1 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-20 22:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Jaegeuk Kim, Theodore Ts'o, linux-f2fs-devel, linux-fsdevel, stable

On 2021/8/21 2:11, Eric Biggers wrote:
> On Fri, Aug 20, 2021 at 05:35:21PM +0800, Chao Yu wrote:
>>>>>>
>>>>>> Hmm, I'm still trying to deal with this as a corner case where the writes
>>>>>> haven't completed due to an error. How about keeping the preallocated block
>>>>>> offsets and releasing them if we get an error? Do we need to handle EIO right?
>>>>>
>>>>> What about the case that CP + SPO following DIO preallocation? User will
>>>>> encounter uninitialized block after recovery.
>>>>
>>>> I think buffered writes as a workaround can expose the last unwritten block as
>>>> well, if SPO happens right after block allocation. We may need to compromise
>>>> at certain level?
>>>>
>>>
>>> Freeing preallocated blocks on error would be better than nothing, although note
>>> that the preallocated blocks may have filled an arbitrary sequence of holes --
>>> so simply truncating past EOF would *not* be sufficient.
>>>
>>> But really filesystems need to be designed to never expose uninitialized data,
>>> even if I/O errors or a sudden power failure occurs.  It is unfortunate that
>>> f2fs apparently wasn't designed with that goal in mind.
>>>
>>> In any case, I don't think we can proceed with any other f2fs direct I/O
>>> improvements until this data leakage bug can be solved one way or another.  If
>>> my patch to remove support for allocating writes isn't acceptable and the
>>> desired solution is going to require some more invasive f2fs surgery, are you or
>>> Chao going to work on it?  I'm not sure there's much I can do here.
>>
>> I may have time to take look into the implementation as I proposed above, maybe
>> just enabling this in FSYNC_MODE_STRICT mode if user concerns unwritten data?
>> thoughts?
>>
> 
> What does this have to do with fsync?

Oops, maybe a separate option is more appropriate.

> 
> - Eric
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: remove broken support for allocating DIO writes
@ 2021-08-20 22:01                       ` Chao Yu
  0 siblings, 0 replies; 42+ messages in thread
From: Chao Yu @ 2021-08-20 22:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: linux-fsdevel, Jaegeuk Kim, Theodore Ts'o, stable, linux-f2fs-devel

On 2021/8/21 2:11, Eric Biggers wrote:
> On Fri, Aug 20, 2021 at 05:35:21PM +0800, Chao Yu wrote:
>>>>>>
>>>>>> Hmm, I'm still trying to deal with this as a corner case where the writes
>>>>>> haven't completed due to an error. How about keeping the preallocated block
>>>>>> offsets and releasing them if we get an error? Do we need to handle EIO right?
>>>>>
>>>>> What about the case that CP + SPO following DIO preallocation? User will
>>>>> encounter uninitialized block after recovery.
>>>>
>>>> I think buffered writes as a workaround can expose the last unwritten block as
>>>> well, if SPO happens right after block allocation. We may need to compromise
>>>> at certain level?
>>>>
>>>
>>> Freeing preallocated blocks on error would be better than nothing, although note
>>> that the preallocated blocks may have filled an arbitrary sequence of holes --
>>> so simply truncating past EOF would *not* be sufficient.
>>>
>>> But really filesystems need to be designed to never expose uninitialized data,
>>> even if I/O errors or a sudden power failure occurs.  It is unfortunate that
>>> f2fs apparently wasn't designed with that goal in mind.
>>>
>>> In any case, I don't think we can proceed with any other f2fs direct I/O
>>> improvements until this data leakage bug can be solved one way or another.  If
>>> my patch to remove support for allocating writes isn't acceptable and the
>>> desired solution is going to require some more invasive f2fs surgery, are you or
>>> Chao going to work on it?  I'm not sure there's much I can do here.
>>
>> I may have time to take look into the implementation as I proposed above, maybe
>> just enabling this in FSYNC_MODE_STRICT mode if user concerns unwritten data?
>> thoughts?
>>
> 
> What does this have to do with fsync?

Oops, maybe a separate option is more appropriate.

> 
> - Eric
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2021-08-20 22:01 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-28  1:51 [PATCH] f2fs: remove broken support for allocating DIO writes Eric Biggers
2021-07-28  1:51 ` [f2fs-dev] " Eric Biggers
2021-07-30 19:17 ` Eric Biggers
2021-07-30 19:17   ` [f2fs-dev] " Eric Biggers
2021-07-30 22:12   ` Jaegeuk Kim
2021-07-30 22:12     ` [f2fs-dev] " Jaegeuk Kim
2021-07-30 22:19     ` Eric Biggers
2021-07-30 22:19       ` [f2fs-dev] " Eric Biggers
2021-07-31  1:05       ` Jaegeuk Kim
2021-07-31  1:05         ` [f2fs-dev] " Jaegeuk Kim
2021-07-31  1:18         ` Eric Biggers
2021-07-31  1:18           ` [f2fs-dev] " Eric Biggers
2021-07-31  2:46   ` Theodore Ts'o
2021-07-31  2:46     ` [f2fs-dev] " Theodore Ts'o
2021-08-02  4:39     ` Eric Biggers
2021-08-02  4:39       ` [f2fs-dev] " Eric Biggers
2021-08-02  9:00       ` Chao Yu
2021-08-02  9:00         ` [f2fs-dev] " Chao Yu
2021-08-02 18:23         ` Jaegeuk Kim
2021-08-02 18:23           ` [f2fs-dev] " Jaegeuk Kim
2021-08-03  1:19           ` Chao Yu
2021-08-03  1:19             ` [f2fs-dev] " Chao Yu
2021-08-03  1:34             ` Jaegeuk Kim
2021-08-03  1:34               ` [f2fs-dev] " Jaegeuk Kim
2021-08-17  2:03               ` Eric Biggers
2021-08-17  2:03                 ` [f2fs-dev] " Eric Biggers
2021-08-17  5:42                 ` Christoph Hellwig
2021-08-17  5:42                   ` [f2fs-dev] " Christoph Hellwig
2021-08-17 18:57                   ` Jaegeuk Kim
2021-08-17 18:57                     ` [f2fs-dev] " Jaegeuk Kim
2021-08-17 20:27                     ` Eric Biggers
2021-08-17 20:27                       ` [f2fs-dev] " Eric Biggers
2021-08-17 21:33                       ` Jaegeuk Kim
2021-08-17 21:33                         ` [f2fs-dev] " Jaegeuk Kim
2021-08-18  0:06                         ` Eric Biggers
2021-08-18  0:06                           ` [f2fs-dev] " Eric Biggers
2021-08-20  9:35                 ` Chao Yu
2021-08-20  9:35                   ` [f2fs-dev] " Chao Yu
2021-08-20 18:11                   ` Eric Biggers
2021-08-20 18:11                     ` [f2fs-dev] " Eric Biggers
2021-08-20 22:01                     ` Chao Yu
2021-08-20 22:01                       ` [f2fs-dev] " Chao Yu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.